Transcriptome summary for patient sample test_sample_WTS.

##### We attempt to structure the script in the following way:
# 1. Defining functions
# 2. Loading libraries
# 3. Loading sample data and reference datasets
# Then... code chunks involving data processing
# Then... code chunks calling the processed data to produce tables / plots / data summary
# Finish with Session info in Addendum section

##### The processed data is stored in "ref_dataset.list" list variable with elements holding the following data:
# 1. ref_dataset.list[[dataset]][["combined_data"]] = combined read count data (reference datasets + sample data) ("combineDatasets" function output in the "load_ref_data chunk")
# 2. ref_dataset.list[[dataset]][["sample_annot"]] = combined data samples annotation ("combineDatasets" function output in the "load_ref_data chunk")
# 3. ref_dataset.list[[dataset]][["clinical_info"]] = clinical information (survival and treatment info)
# 4. ref_dataset.list[[dataset]][["combined_data_processed"]] = transformed, filtered and normalised data (see "data_transformation" and "data_normalisation" chunks)
# 5. ref_dataset.list[[dataset]][["batch_effect_corrected"]] = transformed, filtered, normalised and batch effect corrected data (see "batch_effect_correction" chunk)
# 6. ref_dataset.list[[dataset]][["pca_combined_data_processed"]] = PCA results for combined data
# 7. ref_dataset.list[[dataset]][["pca_batch_effect_corrected"]] = PCA results for batch-effect corrected data
# 8. ref_dataset.list[[dataset]][["rle_combined_data_processed"]] = RLE plot for combined data
# 9. ref_dataset.list[[dataset]][["rle_batch_effect_corrected"]] = RLE plot for batch-effect corrected data
# 10. ref_dataset.list[[dataset]][["data_to_report"]] = Fully combined and processed data to be used for reporting
# 11. ref_dataset.list[[dataset]][["gene_annot_all"]] = gene annotation for combined read count data, containing all input genes. The annotation includes "SYMBOL", "GENEBIOTYPE", "ENSEMBL", "SEQNAME", "GENESEQSTART", "GENESEQEND". "ENSEMBL" is used for rownames
# 12. ref_dataset.list[[dataset]][["gene_annot"]] = gene annotation for transformed, filtered and normalised data. The annotation includes "SYMBOL", "GENEBIOTYPE", "ENSEMBL", "SEQNAME", "GENESEQSTART", "GENESEQEND". "SYMBOL" is used for rownames
# 13. ref_dataset.list[[dataset]][["expr_mut_cn_data_all"]] = combined expression, mutation and copy-number data
# 14. ref_dataset.list[[dataset]][["expr_mut_cn_data"]] = combined expression, mutation and copy-number data limited to cancer genes that meet user-defined CN values threshold

##### Genes of interest are stored in "ref_genes.list" list variable with elements holding the following gene sets:
# 1. ref_genes.list[["genes_cancer"]] = list of cancer genes derived from UMCCR Cancer Gene list (https://github.com/vladsaveliev/NGS_Utils/blob/master/ngs_utils/reference_data/key_genes/umccr_cancer_genes.2019-03-20.tsv) and OncoKB portal (http://oncokb.org/#/cancerGenes) 
# 2. ref_genes.list[["genes_oncokb"]] = list of cancer genes derived from OncoKB portal (http://oncokb.org/#/cancerGenes) alone (although genes present on the UMCCR panel are also flagged)
# 3. ref_genes.list[["genes_immune"]] = list of immune response markers provided in the "An Immunogram for the Cancer-Immunity Cycle" paper by Karasaki et al. (2017) (https://www.ncbi.nlm.nih.gov/pubmed/28088513) and OmniSeq report (https://www.omniseq.com/)
# 4. ref_genes.list[["genes_hrd"]] = list of hrd (homologous recombination deficiency) genes
# 5. ref_genes.list[["pcgr"]] = list and PCGR annotation of mutated genes in given patient based on PCGR report
# 6. ref_genes.list[["purple"]] = list and PURPLE annotation of copy-number (CN) altered genes in given patient based on PURPLE results
# 7. ref_genes.list[["manta"]] = list and MANTA annotation of structural variants (SVs) with affected genes in given patient based on MANTA results
# 8. ref_genes.list[["arriba"]] = list and ARRIBA annotation of gene fusion events detected in given patient based on ARRIBA results
# 9. ref_genes.list[["pizzly"]] = list and PIZZLY annotation of gene fusion events detected in given patient based on PIZZLY results
# 10. ref_genes.list[["summary"]] = summary of above-mentioned gene lists. These gene lists are also used for generating expression summary tables and plots in individual report sections
##### Reference time point used by the "timeit" hook below to report elapsed time
NOW <- Sys.time()

##### Time chunks during knitting
##### Register a knitr chunk hook called "timeit". The hook receives "before" = TRUE when it is
##### invoked ahead of a chunk and FALSE when it is invoked after the chunk
knitr::knit_hooks$set(timeit = function(before) {
  
  if (before) {
    ##### Report the start time and reset the reference time point defined above
    print(paste("Start:", Sys.time()))
    NOW <<- Sys.time()
  } else {
    ##### Report the stop time and the time elapsed since the reference time point
    print(paste("Stop:", Sys.time()))
    print(Sys.time() - NOW)
  }
})

##### Set the "timeit" chunk option to TRUE by default, so that the hook applies to the chunks
knitr::opts_chunk$set(timeit = TRUE)
##### Define functions
##### Create 'not in' operator: returns TRUE for each element of "x" that has no match in "table"
"%!in%" <- function(x, table) {
  !(x %in% table)
}

##### Prepare object to write into a file
##### The row names of "x" are prepended as the first column and that column is given an empty name
prepare2write <- function (x) {

  with_ids <- cbind(rownames(x), x)
  colnames(with_ids) <- c("", colnames(x))

  with_ids
}

##### Combine sample expression profile with reference datasets. This function outputs a list with first element containing the merged data and second element containing merged targets info
#
# Arguments:
#   sample_name   = name of the investigated sample. It is used as the sample column name, as the sample row name (and values) in the targets table and in the name of the missing genes file
#   sample_counts = data frame with gene IDs in the first column and the sample read counts in the second column (it is given exactly two column names below)
#   ref_data      = list with "ext_ref" and "int_ref" elements, each holding (1) the read count file, (2) the target file and (3) the dataset name
#   report_dir    = directory to which the list of missing genes is written
#   dataset       = not used in this function (kept so that existing calls remain valid)
#
# NOTE(review): the target files are expected to contain "Sample_name" as the first column and exactly one more column, since the sample entry added below has two columns - confirm against the reference files
combineDatasets <- function(sample_name, sample_counts, ref_data, report_dir, dataset) {

  ##### Read the target file of one reference dataset, add the "Dataset" column, use prefixed sample names as rownames and drop the first column
  read_targets <- function(ref) {
    target <- read.table(ref[2], sep="\t", as.is=TRUE, header=TRUE)
    target <- cbind(target, rep(ref[3], nrow(target)))
    colnames(target)[ncol(target)] <- "Dataset"

    ##### Add prefix to sample names
    rownames(target) <- paste(target[,"Dataset"], target[,"Sample_name"], sep = ".")
    target[, -1]
  }

  ##### Extract info about target files for the external and the internal reference datasets
  target.comb <- rbind(read_targets(ref_data[["ext_ref"]]), read_targets(ref_data[["int_ref"]]))

  ##### Add sample info
  target.sample <- data.frame(sample_name, sample_name)
  names(target.sample) <- names(target.comb)
  rownames(target.sample) <- sample_name
  target.comb <- rbind( target.comb, target.sample )

  ##### Make syntactically valid names
  rownames(target.comb) <- make.names(rownames(target.comb))

  ##### Start the combined matrix with the sample read counts
  datasets.comb <- sample_counts
  names(datasets.comb) <- c("", sample_name)

  ##### List genes present in the sample read count file
  gene_list <- as.vector(datasets.comb[,1])

  ##### Loop through the expression data from different datasets and merge them into one matrix
  for ( i in seq_along(ref_data) ) {

    dataset.counts <- as.data.frame( read.table(gzfile(ref_data[[i]][1]), header=TRUE, sep="\t", row.names=NULL) )

    ##### Add prefix to sample names. The prefix is the name of the dataset the count file belongs to, so that it
    ##### matches the rownames in the targets table regardless of the order of elements in "ref_data"
    colnames(dataset.counts) <- paste(ref_data[[i]][3], colnames(dataset.counts), sep = ".")

    ##### List genes present in individual files
    gene_list <- c( gene_list, as.vector(dataset.counts[,1]) )

    ##### Merge the expression datasets (by the first column) and make sure that the genes order is the same. Only genes present in all files are kept
    datasets.comb <- merge( datasets.comb, dataset.counts, by=1, all = FALSE, sort= TRUE)
  }

  ##### Use gene IDs as rownames
  rownames(datasets.comb) <- datasets.comb[,1]
  datasets.comb <- datasets.comb[, -1]

  ##### Make syntactically valid names
  colnames(datasets.comb) <- make.names(colnames(datasets.comb))

  ##### Make sure that the target file contains info only about samples present in the data matrix
  target.comb <- target.comb[ rownames(target.comb) %in% colnames(datasets.comb),  ]

  ##### Make sure that the samples order in the data matrix is the same as in the target file
  datasets.comb <- datasets.comb[ , rownames(target.comb) ]

  ##### Identify genes that were not present across all per-sample files and were omitted in the merged matrix
  gene_list.missing <- setdiff(unique(gene_list), rownames(datasets.comb))

  ##### Write list of missing genes into a file
  if ( length(gene_list.missing) > 0 ) {
    write.table(prepare2write(gene_list.missing), file = paste0(report_dir, "/", sample_name, ".RNAseq_report.missing_genes.txt"), sep="\t", quote=FALSE, row.names=TRUE, append = FALSE )
  }

  list(datasets.comb, target.comb)
}

##### Assign colours to different elements
#
# Arguments:
#   elements = vector of labels (e.g. one group label per sample)
#
# Value: list with (1) colours assigned to the unique labels, in the order of their first appearance,
#        and (2) vector with the colour of each individual element
getColours <- function(elements) {

  groups <- unique(elements)

  ##### Predefined selection of colours for 3 and 4 groups. Otherwise generate one colour per group,
  ##### so that the colours are spread across the whole rainbow palette
  if ( length(groups) == 3 ) {
    elements.colours <- c("powderblue", "red", "gray50")
  } else if ( length(groups) == 4 ) {
    elements.colours <- c("powderblue", "forestgreen", "red", "gray50")
  } else {
    elements.colours <- rainbow(length(groups))
  }

  ##### Keep the groups in the order of their first appearance
  f.elements <- factor(elements, levels = groups)
  vec.elements <- elements.colours[seq_len(nlevels(f.elements))]

  ##### Look up the colour of each element by its group index (works for empty input as well)
  elements.colour <- vec.elements[as.integer(f.elements)]

  list(vec.elements, elements.colour)
}

##### Calculate TPM from RPKM (from http://luisvalesilva.com/datasimple/rna-seq_units.html )
##### Each column of "x" is divided by its column sum multiplied by 1e-06
tpm_from_rpkm <- function(x) {

  ##### Per-column scaling factors
  scaling <- 1e-06 * colSums(x)

  ##### Transpose so that the factors are recycled along the columns of "x", then transpose back
  scaled.t <- t(x) / scaling
  t(scaled.t)
}

##### Function to generate a full-resolution pdf image before generating a small image in the chunk (from https://stackoverflow.com/questions/37834053/what-is-a-simple-way-to-thumbnail-some-plots-in-r-markdown-knitr )
#
# Arguments:
#   x       = path to the plot file produced for the chunk
#   options = chunk options. If "thumb" is set, its "width" and "height" are used for the pdf, and "code" is re-evaluated to draw into it
#
# Value: output of knitr:::hook_plot_md_base for the (possibly modified) options
allow_thumbnails <- function(x, options) {
  if (!is.null(options$thumb)) {
    filename <- sprintf("%s.full.pdf", strsplit(basename(x), "\\.")[[1]][1])
    absolute_path <- file.path(dirname(x), filename)

    ##### Evaluate the chunk code in this function's environment, as done before
    chunk.env <- environment()

    ##### Generate the full resolution pdf. The device is closed even if the chunk code fails,
    ##### so that no pdf device is left open
    ##### NOTE(review): the chunk code is re-run with eval(parse()) - it should only ever come from the report itself
    pdf(absolute_path, width = options$thumb$width, height = options$thumb$height)
    tryCatch(
      eval(parse(text = options$code), envir = chunk.env),
      finally = dev.off()
    )

    ##### Add an html link to the low resolution png
    options$fig.link <- absolute_path
  }

  knitr:::hook_plot_md_base(x, options)
}

##### Perform PCA. This function outputs a list with (1) a list holding the PCA dataframe, variance importance and samples colouring info, (2) the interactive PCA plot and (3) the interactive scree plot
#
# Arguments:
#   data       = expression data with genes in rows and samples in columns
#   targets    = samples annotation with "Target" and "Dataset" columns, in the same order as the columns in "data"
#   title      = title used for both plots
#   report_dir = report directory. The plots are saved as html files in its "InputDataPlots" sub-directory
#   suffix     = suffix added to the names of the html files
pca <- function(data, targets, title = "", report_dir, suffix = "" ) {

  ##### Keep only genes with variance > 0 across all samples
  rsd <- apply(data, 1, sd)
  data.subset <- data[rsd > 0, ]

  ##### Perform PCA
  data.subset_pca <- prcomp(t(data.subset), scale. = FALSE)

  ##### The first three principal components are reported below
  if ( ncol(data.subset_pca$x) < 3 ) {
    stop("At least three principal components are required to generate the PCA summary", call. = FALSE)
  }

  ##### Get variance importance for all principal components
  variance.proportion <- summary(data.subset_pca)$importance[2,]
  importance_pca <- paste0(round(100*variance.proportion, 2), "%")
  names(importance_pca) <- names(variance.proportion)

  ##### Prepare data frame
  data.subset_pca.df <- data.frame(targets$Target, targets$Dataset, data.subset_pca$x[,"PC1"], data.subset_pca$x[,"PC2"], data.subset_pca$x[,"PC3"])
  colnames(data.subset_pca.df) <- c("Target", "Dataset", "PC1", "PC2", "PC3")

  ##### Assign colours to targets and datasets
  targets.colour <- getColours(targets$Target)
  datasets.colour <- getColours(targets$Dataset)

  ##### Create a list with dataframe and samples colouring info
  pca.list <- list(data.subset_pca.df, importance_pca, targets.colour, datasets.colour)
  names(pca.list) <- c("pca.df", "importance_pca", "targets", "datasets")

  ##### Change the targets levels order (for plotting only; the dataframe stored in the list above is not changed)
  data.subset_pca.df$Target <- factor(data.subset_pca.df$Target, levels = unique(data.subset_pca.df$Target))

  ##### Generate PCA 2-D plot
  pca_plot <- plot_ly(
    data.subset_pca.df, x = ~PC1, y = ~PC2, color = ~Target,
    text = paste(targets$Target, rownames(data.subset_pca.df), sep=": "),
    colors = targets.colour[[1]], type = 'scatter', mode = "markers",
    marker = list(size=10, opacity = 0.7), width = 800, height = 500
  ) %>%
    layout(
      title = title,
      xaxis = list(title = paste0("PC1", " (", importance_pca["PC1"], ")")),
      yaxis = list(title = paste0("PC2", " (", importance_pca["PC2"], ")")),
      margin = list(l=50, r=50, b=50, t=30, pad=4),
      autosize = FALSE, showlegend = TRUE, legend = list(orientation = "v", y = 0.9)
    )

  ##### Generate Scree-plot
  data.subset_scree.df <- data.frame(paste0("PC ", seq_along(importance_pca)), as.numeric(gsub("%", "", importance_pca)))
  colnames(data.subset_scree.df) <- c("PC", "Variances")

  ##### The default order will be alphabetized unless specified as below
  data.subset_scree.df$PC <- factor(data.subset_scree.df$PC, levels = data.subset_scree.df[["PC"]])

  scree_plot <- plot_ly(data.subset_scree.df, x = ~PC, y = ~Variances, type = 'bar', width = 800, height = 350) %>%
    layout(title = title, xaxis = list(title = ""), margin = list(l=50, r=50, b=100, t=30, pad=4), autosize = FALSE)

  ##### Create directory for the plots
  PCAplotDir <- file.path(report_dir, "InputDataPlots")
  if ( !dir.exists(PCAplotDir) ) {
    dir.create(PCAplotDir, recursive=TRUE)
  }

  ##### Save interactive plot as html file
  saveWidgetFix(pca_plot, file = paste0(PCAplotDir, "/pca_plot", suffix, ".html"))
  saveWidgetFix(scree_plot, file = paste0(PCAplotDir, "/scree_plot", suffix, ".html"))

  list(pca.list, pca_plot, scree_plot)
}

##### Convert a vector of numbers into corresponding vector of their percentiles
##### Ranks are truncated to whole numbers (affects tied values) and expressed as a percentage of the vector length
perc.rank <- function(x) {
  whole.ranks <- trunc(rank(x))
  whole.ranks * 100 / length(x)
}

##### Perform range standardization between 0 and 1 (for the cumulative sums)
##### The result is a plain vector (names are kept, dimensions are dropped)
standardization <- function(x) {
  lowest <- min(x)
  value.range <- max(x) - lowest
  c(x - lowest) / value.range
}

##### Calculate cumulative sum while keeping the original data order
#
# Arguments:
#   x = numeric vector
#
# Value: vector of the same length and order as "x", holding for each element the cumulative sum
#        (over the range-standardised values sorted in increasing order) up to that element,
#        range-standardised between 0 and 1
cumsum_ordered <- function(x) {

  ##### Perform range standardization between 0 and 1, otherwise the negative values are summed up
  standarised <- standardization(x)

  ##### Sort and cumsum values
  sort.order <- order(standarised)
  sorted_cumsum <- cumsum(standarised[ sort.order ])

  ##### Restore the original elements order. The inverse permutation is used rather than the element names,
  ##### so that vectors without names or with duplicated names are handled correctly
  ordered_cumsum <- sorted_cumsum[ order(sort.order) ]

  ##### Perform range standardization between 0 and 1
  standardization(ordered_cumsum)
}

##### Check for nearest value in a vector
##### Returns the position of the first element of "vector" with the smallest absolute difference to "x"
nearest_position <- function(vector, x) {
  distances <- abs(vector - x)
  which.min(distances)
}

##### Calculate gene-wise median, sd, quantiles and cumulative fractions for expression data
#
# Arguments:
#   data    = expression data with genes in rows and samples in columns (with row and column names)
#   targets = samples annotation with "Target" column, in the same order as the columns in "data"
#
# Value: list with
#   (1) per-group data frames with "median", "sd", "z", "quantile" and "cum" values for each gene
#   (2) per-group lists with per-sample expression ("median"), Z-score ("z"), percentile ("q") and cumulative ("cum") values
#
# NOTE(review): rowMedians and rowSds are used for groups with > 1 sample and > 1 gene, so "data" is presumably a numeric matrix - confirm against callers
exprGroupsStats_geneWise <- function(data, targets) {

  ##### Perform Z-score transformation of the expression values
  data.z <- t(apply(data, 1, scale, scale = TRUE))
  colnames(data.z) <- colnames(data)

  ##### Remove rows with potential NA's, which is due to SD = 0 across all samples
  data.z <- data.z[rowSums(!is.na(data.z)) > 0, , drop = FALSE]
  data <- data[ rownames(data) %in% rownames(data.z), , drop = FALSE]

  ##### Perform the gene-wise calculations across all groups
  ##### Convert expression values into corresponding percentiles
  data.q <- t(apply(data, 1, perc.rank))

  ##### Calculate cumulative sums and perform range standardization between 0 and 1
  data.cum <- t(apply(data, 1, cumsum_ordered))

  targets.list <- unique(targets$Target)
  stats.names <- c("median", "sd", "z", "quantile", "cum")
  gene.idx <- seq_len(nrow(data))

  ##### Create lists with stats for each group and gene
  group_stats.list <- vector("list", length(targets.list))
  names(group_stats.list) <- targets.list

  #### For each group...
  for ( group in targets.list ) {

    in.group <- targets$Target %in% group
    group.samples <- colnames(data)[ in.group ]
    multi.sample <- sum(in.group, na.rm = TRUE) > 1

    ##### For groups with > 1 sample and data with > 1 gene get the per-gene values with row-wise functions
    if ( multi.sample && nrow(data) > 1 ) {

      ##### Median Z-score of each gene within the group
      group.z.median <- rowMedians(data.z[ , group.samples ])

      ##### Extract the cumulative fraction corresponding to the median Z-score
      ##### First, for each gene get the position of the Z-score nearest to the median Z-score of that gene...
      median.pos <- vapply(gene.idx, function(i) {
        pos <- nearest_position(data.z[ i, ], group.z.median[i])
        if ( length(pos) == 0 ) NA_integer_ else pos
      }, integer(1))

      ##### ... and then extract the cumulative value of that gene at this position ([gene, position] pairs)
      group.stats <- cbind(
        rowMedians(data[ , group.samples ]),
        rowSds(data[ , group.samples ]),
        group.z.median,
        rowMedians(data.q[ , group.samples ]),
        data.cum[ cbind(gene.idx, median.pos) ],
        deparse.level = 0
      )

    ##### For groups with > 1 sample and data with a single gene the subsets are plain vectors
    } else if ( multi.sample && nrow(data) == 1 ) {

      group.z.median <- median(data.z[ , group.samples ])

      ##### With a single gene the position within the one-row matrix is the sample position
      median.pos <- nearest_position( data.z, group.z.median)

      group.stats <- cbind(
        median(data[ , group.samples ]),
        sd(data[ , group.samples ]),
        group.z.median,
        median(data.q[ , group.samples ]),
        data.cum[ median.pos ],
        deparse.level = 0
      )

    ##### Otherwise report the values of the sample itself. The sd is not available
    } else {

      group.stats <- cbind(
        data[ , group.samples ],
        rep( NA, nrow(data)),
        data.z[ , group.samples ],
        data.q[ , group.samples ],
        data.cum[ , group.samples ],
        deparse.level = 0
      )
    }

    group.stats <- as.data.frame(group.stats)
    names(group.stats) <- stats.names
    rownames(group.stats) <- rownames(data)
    group_stats.list[[group]] <- group.stats
  }

  ##### Finally, extract per-sample values for each gene within individual groups
  gene_stats.list <- vector("list", length(targets.list))
  names(gene_stats.list) <- targets.list

  #### For each group...
  for ( group in targets.list ) {

    in.group <- targets$Target %in% group

    gene_stats.list[[group]] <- list(
      ##### Per-gene expression values
      median = data[ , colnames(data)[ in.group ], drop = FALSE ],
      ##### Per-gene z-score values
      z = data.z[ , colnames(data.z)[ in.group ], drop = FALSE ],
      ##### Per-gene percentile values
      q = data.q[ , colnames(data.q)[ in.group ], drop = FALSE ],
      ##### Per-gene cumulative values
      cum = data.cum[ , colnames(data.cum)[ in.group ], drop = FALSE ]
    )
  }

  list( group_stats.list, gene_stats.list)
}

##### Calculate group-wise median, sd, quantiles and cumulative fractions for expression data from specific sample group
#
# Arguments:
#   data    = expression data with genes in rows and samples in columns
#   targets = samples annotation with "Target" column, in the same order as the columns in "data"
#   target  = name(s) of the group(s) to summarise
#
# Value: data frame with "median", "sd", "z", "quantile" and "cum" columns, with genes sorted by increasing median expression
exprGroupStats_groupWise <- function(data, targets, target) {

  ##### Subset data for defined biological group
  group.expr <- data[, targets$Target %in% target ]

  if ( is.null(ncol(group.expr)) ) {

    ##### The subset has no columns attribute, i.e. it was reduced to a vector (single sample). Use its values directly. The sd is not available
    gene.median <- sort(group.expr)
    gene.sd <- rep( NA, length(group.expr))

  } else {

    ##### Otherwise get the median and standard deviation for each gene, and sort genes by the median
    gene.median <- rowMedians(group.expr)
    names(gene.median) <- rownames(group.expr)
    gene.median <- sort(gene.median)
    gene.sd <- rowSds(group.expr)
  }

  ##### Make sure the median and sd vectors have the same gene order
  names(gene.sd) <- rownames(group.expr)
  gene.sd <- gene.sd[names(gene.median)]

  ##### Convert expression values into corresponding percentiles
  gene.q <- perc.rank(gene.median)

  ##### Perform range standardization between 0 and 1, calculate cumulative sums of the sorted values and standardise them between 0 and 1 again
  gene.cum <- standardization(cumsum(sort(standardization(gene.median))))

  ##### Centre the median expression values (scale = FALSE, so the values are not divided by their standard deviation)
  gene.z <- scale(gene.median, scale = FALSE)

  ##### Organise the data into data frame
  group.stats <- as.data.frame(cbind( gene.median, gene.sd, gene.z, gene.q, gene.cum))
  names(group.stats) <- c("median", "sd", "z", "quantile", "cum")

  group.stats
}

##### Generate cumulative distribution function (CDF) plot for selected gene. If option "addBoxPlot" = TRUE, then generate additional boxplot below to show the data variance for selected gene in individual groups
cdfPlot <- function(gene, data, targets, sampleName, int_cancer, ext_cancer, comp_cancer, add_cancer = NULL, addBoxPlot = FALSE, scaling = "gene-wise", report_dir) {
  
  ##### Remove the internal reference cohort data if the patient samples origins from other tissue. Of note, the internal reference cohort was only used to process the in-house data (including the investigated patient sample) and to correct batch-effects
  if ( comp_cancer != int_cancer ) {
    targets <- targets[ targets$Target %!in% int_cancer, ]
    data <- data[ ,rownames(targets) ]
  }
  
  ##### Initiate lists with stats for each group
  targets.list <- unique(targets$Target)
  group.z <- vector("list", length(targets.list))
  names(group.z) <- targets.list
  
  ##### .... and for selected gene
  group.z.gene <- vector("list", length(targets.list))
  names(group.z.gene) <- targets.list

  ##### Get expression-related stats for each group
  ##### ... from gene-wise approach 
  if ( scaling == "gene-wise" ) {

    ##### Get stats for each group
    gene.data <- data[ gene, , drop = FALSE]
    group.z.gene <- exprGroupsStats_geneWise(gene.data, targets)[[1]]
    
    ##### ... and for each sample in individual groups
    gene.stats <- exprGroupsStats_geneWise(gene.data, targets)[[2]]

    for ( group in targets.list ) {
        group.z[[ group]] <- cbind(t(gene.stats[[ group]]$median), t(gene.stats[[ group]]$z), t(gene.stats[[ group]]$q), t(gene.stats[[ group]]$cum) )
        group.z[[ group]] <- as.data.frame(group.z[[ group]])
        colnames(group.z[[ group]]) <- c("median", "z", "quantile", "cum")
    }
    
    group.z[[ sampleName ]] <- do.call("rbind", group.z)
    
  ##### ... or from group-wise approach
  } else {
    group.z[[ sampleName ]] <- exprGroupStats_groupWise(data, targets, sampleName)
    group.z[[ ext_cancer ]] <- exprGroupStats_groupWise(data, targets, ext_cancer)
    
    ##### Extract expression for selected genes
    group.z.gene[[ sampleName ]] <- group.z[[ sampleName ]][ rownames(group.z[[ sampleName ]]) %in% gene, ]
    group.z.gene[[ ext_cancer ]] <- group.z[[ ext_cancer ]][ rownames(group.z[[ ext_cancer ]]) %in% gene, ]
    
    ##### Add info for internal cohort
    if ( comp_cancer == int_cancer ) {
      group.z[[ int_cancer ]] <- exprGroupStats_groupWise(data, targets, int_cancer)
      group.z.gene[[ int_cancer ]] <- group.z[[ int_cancer ]][ rownames(group.z[[ int_cancer ]]) %in% gene, ]
    }
    
    ##### Add info for additional cancer type is specified
    if ( !is.null(add_cancer) ) {
      group.z[[ add_cancer ]] <- exprGroupStats_groupWise(data, targets, add_cancer)
      group.z.gene[[ add_cancer ]] <- group.z[[ add_cancer ]][ rownames(group.z[[ add_cancer ]]) %in% gene, ]
    }
  }
  
  ##### Generate box-plot for selected gene
  if ( addBoxPlot ) {
    ##### Perform Z-score transformation of the median expression values
    if ( scaling == "gene-wise" ) {
      
      data.z <- t(scale(t(data)))
    } else {
      data.z <- scale(data, scale = FALSE)
    }
    
    targets$Target[ targets$Target==sampleName ] <- "Patient"
    gene.expr.df <- data.frame(targets$Target, data.z[gene, ])
    colnames(gene.expr.df) <- c("Group", "Expression")
    
    ##### Reorder groups
    if ( !is.null(add_cancer) ) {
      gene.expr.df$Group <- factor(gene.expr.df$Group, levels=c( add_cancer, ext_cancer, int_cancer, "Patient"))
      group.colours <- c("forestgreen", "cornflowerblue", "red", "black")
    } else {
      gene.expr.df$Group <- factor(gene.expr.df$Group, levels=c(ext_cancer, int_cancer, "Patient"))
      group.colours <- c("cornflowerblue", "red", "black")
    }
    
    p2 <- plot_ly(gene.expr.df, x= ~Expression, color = ~Group, type = 'box', jitter = 0.3, pointpos = 0, boxpoints = 'all', colors = group.colours, opacity = 0.5, orientation = 'h', width = 800, height = 400, showlegend=FALSE)
  }
  
  ##### Generate interactive CDF plot with plotly
  ##### Include the internal reference cohort in the plot
  if ( comp_cancer == int_cancer ) {
    p1 <- plot_ly(group.z[[ sampleName ]], x = ~z, color = I("black"), width = 700, height = 200) %>%
    
      ##### Add sample data
      add_markers(y = group.z.gene[[ sampleName ]]$quantile, x = group.z.gene[[ sampleName ]]$z,
                  text = rownames(group.z.gene[[ sampleName ]] ),
                  name = "Patient",
                  marker = list(size = 12, color = "black"),
                  showlegend = TRUE) %>%
    
      add_lines(y = group.z[[ sampleName ]]$quantile, x = group.z[[ sampleName ]]$z, 
                line = list(color = "grey"),
                text = rownames( group.z[[ sampleName ]] ),
                name = "Patient", showlegend = FALSE) %>%
        
      ##### Add int_cancer data
      add_markers(y = group.z.gene[[ int_cancer ]]$quantile, x =  group.z.gene[[ int_cancer ]]$z,
                  text = rownames( group.z.gene[[ int_cancer ]]),
                  name = int_cancer,
                  marker = list(size = 12, opacity = 0.5, color = "red"),
                  showlegend = TRUE) %>%
    
      add_lines(y = group.z[[ int_cancer ]]$quantile, x = group.z[[ int_cancer ]]$z, opacity = 0.5,
                line = list(color = "red", dash = "dash"),
                text = rownames( group.z[[ int_cancer ]] ),
                name = int_cancer, showlegend = FALSE) %>%
          
      ##### Add ext_cancer data
      add_markers(y = group.z.gene[[ ext_cancer ]]$quantile, x =  group.z.gene[[ ext_cancer ]]$z,
                  text = rownames( group.z.gene[[ ext_cancer ]] ),
                  name = ext_cancer,
                  marker = list(size = 12, opacity = 0.5, color = "cornflowerblue"),
                  showlegend = TRUE) %>%
    
      add_lines(y = group.z[[ ext_cancer ]]$quantile, x = group.z[[ ext_cancer ]]$z, opacity = 0.5,
                line = list(color = "cornflowerblue", dash = "dash"),
                text = rownames( group.z[[ ext_cancer ]] ),
                name = ext_cancer, showlegend = FALSE) %>%
      
      ##### Add quantile lines
      add_lines(y = seq(0,100,10), x = rep(quantile(group.z[[ sampleName ]]$z)[2], 11), opacity = 0.5,
                line = list(color = "gray", dash = "dash"),
                name = "Q1", showlegend = FALSE) %>%
      
      add_lines(y = seq(0,100,10), x = rep(quantile(group.z[[ sampleName ]]$z)[3], 11), opacity = 0.5,
                line = list(color = "gray", dash = "dash"),
                name = "Q2", showlegend = FALSE) %>%
      
      add_lines(y = seq(0,100,10), x = rep(quantile(group.z[[ sampleName ]]$z)[4], 11), opacity = 0.5,
                line = list(color = "gray", dash = "dash"),
                name = "Q3", showlegend = FALSE) %>% 
      
          layout(title = gene, xaxis = list(title = "mRNA expression (Z-score)", zeroline = FALSE, range = c(min(group.z[[ sampleName ]]$z)-1.5, max(group.z[[ sampleName ]]$z)+1.5)),
             yaxis = list(title = "Percentile"),
             legend = list(orientation = 'v', x = 0.02, y = 1, bgcolor = "white")
      )
  
  ##### Skip the internal reference cohort in the plot
  } else {
    p1 <- plot_ly(group.z[[ sampleName ]], x = ~z, color = I("black"), width = 700, height = 200) %>%
  
    ##### Add sample data
    add_markers(y = group.z.gene[[ sampleName ]]$quantile, x = group.z.gene[[ sampleName ]]$z,
                text = rownames(group.z.gene[[ sampleName ]] ),
                name = "Patient",
                marker = list(size = 12, color = "black"),
                showlegend = TRUE) %>%
  
    add_lines(y = group.z[[ sampleName ]]$quantile, x = group.z[[ sampleName ]]$z, 
              line = list(color = "grey"),
              text = rownames( group.z[[ sampleName ]] ),
              name = "Patient", showlegend = FALSE) %>%
        
    ##### Add ext_cancer data
    add_markers(y = group.z.gene[[ ext_cancer ]]$quantile, x =  group.z.gene[[ ext_cancer ]]$z,
                text = rownames( group.z.gene[[ ext_cancer ]] ),
                name = ext_cancer,
                marker = list(size = 12, opacity = 0.5, color = "cornflowerblue"),
                showlegend = TRUE) %>%
  
    add_lines(y = group.z[[ ext_cancer ]]$quantile, x = group.z[[ ext_cancer ]]$z, opacity = 0.5,
              line = list(color = "cornflowerblue", dash = "dash"),
              text = rownames( group.z[[ ext_cancer ]] ),
              name = ext_cancer, showlegend = FALSE) %>%
    
    ##### Add quantile lines
    add_lines(y = seq(0,1,0.1), x = rep(quantile(group.z[[ sampleName ]]$z)[2], 11), opacity = 0.5,
              line = list(color = "gray", dash = "dash"),
              name = "Q1", showlegend = FALSE) %>%
    
    add_lines(y = seq(0,1,0.1), x = rep(quantile(group.z[[ sampleName ]]$z)[3], 11), opacity = 0.5,
              line = list(color = "gray", dash = "dash"),
              name = "Q2", showlegend = FALSE) %>%
    
    add_lines(y = seq(0,1,0.1), x = rep(quantile(group.z[[ sampleName ]]$z)[4], 11), opacity = 0.5,
              line = list(color = "gray", dash = "dash"),
              name = "Q3", showlegend = FALSE) %>% 
    
        layout(title = gene, xaxis = list(title = "mRNA expression (Z-score)", zeroline = FALSE, range = c(min(group.z[[ sampleName ]]$z)-1.5, max(group.z[[ sampleName ]]$z)+1.5)),
           yaxis = list(title = "Percentile"),
           legend = list(orientation = 'v', x = 0.02, y = 1, bgcolor = "white")
    )
  }
  
  ##### Combine CDF plot with boxplot if this option is selected
  if ( addBoxPlot ) {
    p1_2 <- subplot(p1, p2, nrows = 2, shareX = TRUE, shareY = FALSE, titleY = TRUE, heights = c(0.7, 0.3)) %>%
  layout(xaxis = list(title = "mRNA expression (Z-score)", zeroline = FALSE, range = c(min(group.z[[ sampleName ]]$z)-1.5, max(group.z[[ sampleName ]]$z)+1.5)),
          yaxis = list(title = "Percentile"),
          legend = list(orientation = 'v', x = 0.02, y = 1, bgcolor = "white"),
          yaxis2 = list( title =""), xaxis2 = list(title = paste0(gene, " mRNA expression (Z-score)")), margin = list(l=50, r=50, b=50, t=50, pad=4), autosize = FALSE,
         showlegend=TRUE, showlegend2=FALSE)
    
    return( p1_2 )
    
  } else {
    return( p1 )
  }
  ##### Clean the space
  rm(gene, targets, data, sampleName, targets.list, group.z, group.z.gene, gene.data, gene.stats, data.z, gene.expr.df, group.colours)
  
  #### Clear plots to free up some memory
  if(!is.null(dev.list())) invisible(dev.off())
}

##### Convert density values to frequency-like weights
##### The input is multiplied by length(density)/sum(density), so the returned values sum to the number of supplied points
#####   density   Numeric vector of density values
##### Returns a numeric vector of the same length as "density"
density2freq <- function(density) {
  n.points <- length(density)
  scaling.factor <- n.points / sum(density)
  scaling.factor * density
}

##### Generate interactive density plot of the expression distribution for a selected gene, highlighting the sample of interest
#####
##### Arguments:
#####   gene            Row name of "data" for the gene to plot
#####   data            Expression data with genes in rows and samples in columns
#####   main_title      Plot title
#####   x_title         X-axis title
#####   sampleName      Column name of "data" for the sample to be highlighted with a marker
#####   distributions   Optional character vector with reference distributions to overlay. Recognised values (case-insensitive) are "normal", "binomial" and "bimodal"
#####   scaling         "gene-wise" scales each gene across samples; any other value centres each sample (column) without scaling
#####
##### Returns a plotly object
densityPlot <- function(gene, data, main_title, x_title, sampleName, distributions = NULL, scaling = "gene-wise") {
  
  ##### Scale each gene across samples (gene-wise), otherwise centre each sample without scaling
  if ( scaling == "gene-wise" ) {
    data.z <- t(scale(t(data)))
  } else {
    data.z <- scale(data, scale = FALSE)
  }
  
  ##### Use data for the user-defined gene only
  data.z <- data.z[ gene, ,drop=FALSE]
  
  ##### Number of samples, sample names ordered by increasing expression and the corresponding sorted expression values
  n.samples <- ncol(data.z)
  samples.sorted <- colnames(data.z)[order(data.z)]
  expr.sorted <- sort(data.z)
  
  ##### Create data frame and fill it with expression and density values for each sample for selected gene
  ##### NOTE(review): density() evaluates the estimate on an equally spaced grid, whereas "expr" holds the sorted observed values, so these two columns are paired by position only - confirm this is intended
  data.df <- data.frame(gene = "Observed distribution", sample = samples.sorted, expr = expr.sorted, dens = density2freq(density(data.z, n=n.samples)$y))
  
  ##### Generate values for the requested reference distributions
  if ( !is.null(distributions) ) {
    
    distributions <- tolower(distributions)
    
    ##### Get min and max values based on the expression data
    expr.min <- min(expr.sorted)
    expr.max <- max(expr.sorted)
    
    ##### Create empty data frame
    data.df.dist <- data.frame(matrix(ncol = 4, nrow = 0))
    colnames(data.df.dist) <- c("gene", "sample", "expr", "dens")
    
    ##### Generate y-values for normal distribution. Useful resource https://stats.idre.ucla.edu/r/modules/probabilities-and-distributions/
    if ( "normal" %in% distributions ) {
      data.x <- seq(expr.min, expr.max, length.out = n.samples)
      data.y <- dnorm(data.x, mean = mean(data.x), sd = (max(data.x)-mean(data.x))/5)
      data.df.dist <- rbind(data.df.dist, data.frame(gene="Normal distribution", sample = samples.sorted, expr=data.x, dens=density2freq(data.y)))
    }
    
    ##### Generate x- and y-values for binomial distributions (p=0.25 and p=0.75). Useful link https://stat.ethz.ch/R-manual/R-devel/library/stats/html/Binomial.html
    if ( "binomial" %in% distributions ) {
      for ( prob in c(0.25, 0.75) ) {
        data.x <- 1:n.samples
        data.y <- dbinom(data.x, n.samples, prob)
        
        ##### Stretch the x-values to the range of the observed expression values
        data.x <- rescale(data.x, to = c(expr.min, expr.max))
        data.df.dist <- rbind(data.df.dist, data.frame(gene=paste0("Binomial distribution (p=", prob, ")"), sample = samples.sorted, expr=data.x, dens=density2freq(data.y)))
      }
    }
    
    ##### Combine two normal distributions, one below and one above the median, to mimic a bimodal distribution. Useful link https://stats.stackexchange.com/questions/355344/simulating-a-bimodal-distribution-in-the-range-of-15-in-r
    if ( "bimodal" %in% distributions ){
      data.x1 <- seq(expr.min, median(expr.sorted), length.out = n.samples/2)
      data.x2 <- seq(median(expr.sorted), expr.max, length.out = n.samples/2)
      
      ##### seq() rounds fractional "length.out" up, so trim the combined vectors to the number of samples in the data
      data.x <- c(data.x1, data.x2)
      data.x <- data.x[1:n.samples]
      
      ##### Generate y-values for bimodal distribution
      data.y <- c(dnorm(data.x1, mean = mean(data.x1), sd = (max(data.x1)-mean(data.x1))/3), dnorm(data.x2, mean = mean(data.x2), sd = (max(data.x2)-mean(data.x2))/3))
      data.y <- data.y[1:n.samples]
      
      ##### Add bimodal dist values to the distribution dataframe
      data.df.dist <- rbind(data.df.dist, data.frame(gene = "Bimodal distribution", sample = samples.sorted, expr = data.x, dens = density2freq(data.y)))
    }
    
    data.df <- rbind(data.df, data.df.dist)
  }
  
  ##### Extract values for the selected sample. This has to be done regardless of whether reference distributions were requested, since the patient marker is always drawn
  data.df.selected <- data.df[ data.df$sample == sampleName, ]
  
  ##### Get axis limits based on the plotted expression and density values
  den.x <- range(data.df$expr, na.rm = TRUE)
  den.y <- range(data.df$dens, na.rm = TRUE)
  
  ##### Assign colours to distributions
  genes.colour <- getColours(rev(unique(data.df$gene)))
  
  ##### Generate interactive density plot
  p <- plot_ly(data.df, x = ~expr, y = ~dens, type = 'scatter', mode = 'lines', color = ~gene, colors = genes.colour[[1]], width = 750, height = 200) %>%
    add_markers(y = data.df.selected$dens, x = data.df.selected$expr, 
                name = "Patient",
                text = "Patient",
                mode = 'markers',
                marker = list(size = 8, colors = data.df.selected$sample, color = rep(I("black"), each = nrow(data.df.selected)), line = list(color = "grey", width = 2)),
                showlegend = TRUE,
                inherit = FALSE) %>%
     layout(title = main_title,
           xaxis = list(title = x_title, range = c(den.x[1],den.x[2])),
           yaxis = list (title = 'Weight', range = c(den.y[1],den.y[2]), side = "right"),
           legend = list(orientation = 'h', y = 1.3))
  
  return( p )
}

##### Generate bar-plot with per-sample values for a selected gene, with bars coloured by sample group and the sample of interest labelled as "Patient"
#####
##### Arguments:
#####   gene          Row name of "data" for the gene to plot
#####   data          Data with genes in rows and samples in columns
#####   targets       Data frame with sample names as row names and group labels in the "Target" column. Rows are assumed to be in the same order as the columns of "data" - TODO confirm
#####   y_title       Y-axis title
#####   sampleName    Name of the sample of interest (relabelled as "Patient" in the plot)
#####   ext_cancer    Label of the external reference cohort. If not supplied, "ext_cancer_group" has to be defined outside of this function
#####   int_cancer    Label of the internal reference cohort. If not supplied, "int_cancer_group" has to be defined outside of this function
#####   comp_cancer   Currently not used by this function
#####   add_cancer    Optional label of an additional cohort
#####
##### Returns a plotly object
barPlot <- function(gene, data, targets, y_title = "Counts", sampleName,  ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = NULL ) {

  ##### Use data for the user-defined gene only
  data <- data[ gene, ,drop=FALSE]
  
  ##### Relabel the sample of interest as "Patient", both as a group and as a sample name
  targets$Target[ targets$Target==sampleName ] <- "Patient"
  rownames(targets)[ rownames(targets)==sampleName ] <- "Patient"
  
  ##### Prepare data frame
  data.df <- data.frame(Group = targets$Target, Sample = rownames(targets), Data = as.numeric(data))
  
  ##### Define the order of groups and the matching colours
  if ( !is.null(add_cancer) ) {
    group.levels <- c( add_cancer, ext_cancer, int_cancer, "Patient")
    group.colours <- c("forestgreen", "cornflowerblue", "red", "black")
  } else {
    group.levels <- c(ext_cancer, int_cancer, "Patient")
    group.colours <- c("cornflowerblue", "red", "black")
  }
  data.df$Group <- factor(data.df$Group, levels=group.levels)
  
  ##### Keep samples in the input order (the default order would be alphabetical)
  data.df$Sample <- factor(data.df$Sample, levels = data.df[["Sample"]])
  
  p <- plot_ly(data.df, x = ~Sample, y = ~Data, color = ~Group, type = 'bar', colors = group.colours, width = 750, height = 200) %>%
    layout(title = "", xaxis = list( title = "", showticklabels = FALSE), yaxis = list(title = y_title), autosize = FALSE, legend = list(orientation = 'h', y = 1.2), showlegend=TRUE)
  
  return( p )
}

##### Generate boxplot presenting expression profiles for selected set of genes, save it as html file and return it
#####
##### Arguments:
#####   genes         Character vector with genes to plot. Genes absent in the expression data are dropped
#####   data          Expression data with genes in rows and samples in columns
#####   targets       Data frame with sample names as row names and group labels in the "Target" column
#####   sampleName    Group label of the sample of interest (relabelled as "Patient" in the plot)
#####   int_cancer    Label of the internal reference cohort. Its samples are removed if "comp_cancer" differs from it
#####   ext_cancer    Label of the external reference cohort
#####   comp_cancer   Label of the cohort the patient is compared against
#####   add_cancer    Optional label of an additional cohort
#####   hexcode       Prefix used in the name of the saved html file
#####   type          "perc" presents expression as percentiles, any other value presents the scaled values
#####   sort          "diff" sorts genes by the difference between the "comp_cancer" median and the patient, "alphabetically" sorts genes by name, any other value keeps the input order
#####   scaling       "gene-wise" scales each gene across samples; any other value centres each sample (column) without scaling
#####   report_dir    Directory in which the "glanceExprPlots" folder with the html file is created
#####
##### Returns a plotly object
glanceExprPlot <- function(genes, data, targets, sampleName, int_cancer, ext_cancer, comp_cancer, add_cancer = NULL, hexcode, type = "z", sort = "diff", scaling = "gene-wise", report_dir) {
  
  ##### Skip the internal reference cohort if the patient is compared against a different cohort
  if ( comp_cancer != int_cancer ) {
    targets <- targets[ targets$Target %!in% int_cancer, ]
    data <- data[ ,rownames(targets) ]
  }
  
  ##### Define the y-axis title up-front, so that it is available for every scaling/type combination
  ##### NOTE(review): the "Z-score" label is also used for the non gene-wise scaling, where the data is only centred - confirm the wording
  if ( type == "perc" ) {
    y_title <- "mRNA expression (percentile)"
  } else {
    y_title <- "mRNA expression (Z-score)"
  }
  
  ##### Scale each gene across samples (gene-wise), otherwise centre each sample without scaling
  if ( scaling == "gene-wise" ) {
    data.z <- t(scale(t(data)))
  } else {
    data.z <- scale(data, scale = FALSE)
  }
  
  ##### Convert expression values into corresponding percentiles
  if ( type == "perc" ) {
    data.z <- t(apply(data.z, 1, perc.rank))
  }
  
  targets$Target[ targets$Target==sampleName ] <- "Patient"
  
  ##### Make sure that all genes are present in the expression matrix
  genes <- genes[ genes %in% rownames(data.z) ]
  
  if ( length(genes) == 0 ) {
    stop("None of the selected genes is present in the expression data", call. = FALSE)
  }
  
  ##### Genes sorting for visualisation
  ##### Sort genes by the greatest difference between the patient and the "comp_cancer" cohort
  if ( sort == "diff" ) {
    ##### Keep the matrix structure (drop = FALSE) so that rowMedians also works for a single gene or a single "comp_cancer" sample
    comp_cancer.medians <- rowMedians( data.z[ genes, targets$Target==comp_cancer, drop = FALSE ] )
    names(comp_cancer.medians) <- genes
    comp_cancer.medians.diff <- comp_cancer.medians - data.z[ genes ,targets$Target=="Patient" ]
    genes <- genes[ order(comp_cancer.medians.diff) ]
  
  ##### Sort genes alphabetically
  } else if (sort == "alphabetically") {
    genes <- genes[ order(genes) ]
  }

  ##### Prepare dataframe for plotly: one block of rows per gene, bound together in a single step
  gene.expr.df <- do.call(rbind, lapply(genes, function(gene) {
    data.frame(gene, targets$Target, data.z[gene, ])
  }))
  colnames(gene.expr.df) <- c("Gene", "Group", "Expression")
  
  ##### Reorder groups and define the matching colours
  if ( !is.null(add_cancer) ) {
    gene.expr.df$Group <- factor(gene.expr.df$Group, levels=c("Patient", int_cancer, ext_cancer, add_cancer))
    group.colours <- c(I("black"), "red", "cornflowerblue", "forestgreen")
    
  } else {
    gene.expr.df$Group <- factor(gene.expr.df$Group, levels=c("Patient", int_cancer, ext_cancer))
    group.colours <- c(I("black"), "red", "cornflowerblue")
  }
  
  ##### Draw semi-transparent boxes for all groups and overlay the patient values as opaque markers
  p <- plot_ly( gene.expr.df, x = ~Gene, y = ~Expression, color = ~Group, type = "box", colors = group.colours, opacity=0.3, showlegend = TRUE, width = 800, height = 400 ) %>% 
    add_markers(x = ~Gene[ gene.expr.df$Group %in% "Patient" ], y = ~Expression[ gene.expr.df$Group %in% "Patient" ], color = ~Group[ gene.expr.df$Group %in% "Patient" ], marker = list(size = 7), opacity=1, showlegend = FALSE) %>%
    
    layout(boxmode = "group", xaxis = list(title = ""), yaxis = list(title = y_title), legend = list( orientation = 'h', y = max(gene.expr.df$Expression), yanchor = "top", bgcolor = "white"))
    
  ##### Create directory for "at glance" plots
  PlotsDir <- paste(report_dir, "glanceExprPlots", sep = "/")
    
  if ( !file.exists(PlotsDir) ) {
    dir.create(PlotsDir, recursive=TRUE)
  }
  
  ##### Save interactive plot as html file
  saveWidgetFix(p, file = paste(PlotsDir, paste0(hexcode, "_glance_expr_plot.", type, ".html"), sep = "/"))
  
  return( p )
}

##### Generate scatterplot with per-gene expression values (y-axis), CN values (x-axis) and mutation status info (hover text), if provided. The plot is saved as html file and returned
#####
##### Arguments:
#####   data          Data frame with "Gene" and "CN" columns, plus "Z_score_diff" (type = "z") or "Perc_diff" (type = "perc"). The "Alterations" column is required if "alt_data" is TRUE
#####   alt_data      If TRUE, alterations are included in the hover text
#####   cn_bottom     Genes with CN value equal to or below this threshold are annotated on the plot. Has to be supplied by the caller
#####   cn_top        Genes with CN value equal to or above this threshold are annotated on the plot. Has to be supplied by the caller
#####   comp_cancer   Label of the cohort the patient is compared against (used in the y-axis title)
#####   type          "z" (Z-score differences) or "perc" (percentile differences)
#####   report_dir    Directory in which the "cn_expr_plot" folder with the html file is created
#####
##### Returns a plotly object
mutCNexprPlot <- function(data, alt_data = FALSE, cn_bottom = cn_bottom, cn_top = cn_top, comp_cancer, type = "z", report_dir) {
  
  ##### The defaults for "cn_bottom" and "cn_top" refer to themselves and cannot be evaluated, so fail early with an informative message
  if ( missing(cn_bottom) || missing(cn_top) ) {
    stop("Both \"cn_bottom\" and \"cn_top\" need to be provided", call. = FALSE)
  }
  
  ##### Use the requested expression measure as the "Expr" column
  if ( type == "z" ) {
    names(data)[ names(data) %in% "Z_score_diff" ] <- "Expr"
    y_title <- paste0("mRNA expression (Z-score [Patient vs ", comp_cancer, "])")
      
  } else if ( type == "perc" ) {
    names(data)[ names(data) %in% "Perc_diff" ] <- "Expr"
    y_title <- paste0("mRNA expression (percentile [Patient vs ", comp_cancer, "])")
    
  } else {
    stop("Unsupported \"type\" value: ", type, ". Use \"z\" or \"perc\"", call. = FALSE)
  }
  
  ##### Extract info for genes to be annotated on the plot. which() skips genes with missing CN values, which otherwise would be turned into all-NA rows
  data.annot <- data[ which( data$CN >= cn_top | data$CN <= cn_bottom ), ]
  genes2annot <- data.annot$Gene
  
  if ( length(genes2annot) == 0 ) {
    genes2annot <- ""
  }
  
  ##### Generate scatterplot with per-gene expression values (y-axis) (difference between Patient's and [comp_cancer] data), CN values (x-axis) and mutation status info (hover text)
  if ( alt_data ) {
    p <- plot_ly(type='scatter', mode = "markers", width = 800, height = 600, showlegend = FALSE) %>%
      
      add_markers(data = data, y = ~Expr, x = ~CN, 
                name = ~Gene,
                text = paste0("Gene: ", data$Gene,  "\nAlterations: ", data$Alterations),
                mode = 'markers',
                marker = list(size=10, symbol="circle"),
                color = ~Gene,
                showlegend = TRUE,
                legendtitle=TRUE, 
                inherit = FALSE) %>%
      
      add_annotations( data = data.annot, text=genes2annot,
                      x=~CN, xanchor="left",
                      y=~Expr, yanchor="top",
                      font = list(color = "Grey", size = 10),
                      legendtitle=TRUE, showarrow=FALSE ) %>%
      
      layout( xaxis = list(title = "CN value"), yaxis = list(title = y_title), margin = list(l=50, r=50, b=50, t=50, pad=4), autosize = FALSE, legend = list( orientation = 'v', x=1, y=0.97, yanchor="top"), showlegend=TRUE)
  
  ##### Generate scatterplot with per-gene expression values (y-axis) and CN values (x-axis)
  } else {
    p <- plot_ly(data, x = ~CN, y = ~Expr, text=~Gene, color = ~Gene, type='scatter', mode = "markers", marker = list(size=10, symbol="circle"), width = 800, height = 600) %>%
      
      add_annotations( data = data.annot, text=~Gene,
                      x=~CN, xanchor="left",
                      y=~Expr, yanchor="top",
                      font = list(color = "Grey",
                      size = 10),
                      legendtitle=TRUE, showarrow=FALSE ) %>%
      
      layout( xaxis = list(title = "CN value"), yaxis = list(title =  y_title), margin = list(l=50, r=50, b=50, t=50, pad=4), autosize = FALSE, legend = list( orientation = 'v', y=0.8, yanchor="top"), showlegend=TRUE)
  }
  
  ##### Create directory for the plots
  mutCNexprPlotDir <- paste(report_dir, "cn_expr_plot", sep = "/")
  if ( !file.exists(mutCNexprPlotDir) ) {
    dir.create(mutCNexprPlotDir, recursive=TRUE)
  }
  
  ##### Save interactive plot as html file
  saveWidgetFix(p, file = paste(mutCNexprPlotDir, paste0("cn_expr_plot.", type, ".html"), sep = "/"))
    
  return( p )
}

##### Fusion visualisation: export pages of the fusion plots pdf file as individual png images, one per reported fusion
#####
##### Arguments:
#####   arriba_file      Path to the fusion results file. The pdf file with fusion plots is searched for in the same directory
#####   arriba_results   Data frame with one row per fusion and "X.gene1", "gene2", "breakpoint1" and "breakpoint2" columns, which are used to name the png files
#####   results_dir      Directory to write the png files to (created if it does not exist)
#####
##### NOTE(review): page i of the pdf file is assumed to correspond to row i of "arriba_results" - confirm against the caller
arriba_plots <- function(arriba_file, arriba_results, results_dir) {

  ##### Create directory for results
  if ( !file.exists(results_dir) ) {
    dir.create(results_dir, recursive=TRUE)
  }
  
  ##### The pdf file is required only if there are fusions to export
  if ( nrow(arriba_results) > 0 ) {
    
    ##### Get path to fusion visualisation pdf file
    arriba_dir <- dirname(arriba_file)
    arriba_plots.pdf <- list.files(arriba_dir, pattern="\\.pdf$", full.names = TRUE)
    
    if ( length(arriba_plots.pdf) == 0 ) {
      stop("No pdf file with fusion plots found in ", arriba_dir, call. = FALSE)
    }
    
    ##### Only a single pdf file can be rendered, so use the first one if more are present
    if ( length(arriba_plots.pdf) > 1 ) {
      warning("Multiple pdf files found in ", arriba_dir, ". Using ", arriba_plots.pdf[1], call. = FALSE)
      arriba_plots.pdf <- arriba_plots.pdf[1]
    }
    
    ##### Export pdf images to png
    for ( i in seq_len(nrow(arriba_results)) ) {
      arriba_plots.png <- gsub(":", ".", paste0(results_dir, "/", make.names(paste(arriba_results$X.gene1[i], arriba_results$gene2[i], sep = "__")), "_", arriba_results$breakpoint1[i], "-", arriba_results$breakpoint2[i], ".png"))
      fusion <- pdf_render_page(arriba_plots.pdf, page = i, dpi = 300, numeric = TRUE, opw = "", upw = "")
      writePNG(fusion, arriba_plots.png)
    }
  }
  
  #### Clear plots to free up some memory
  if(!is.null(dev.list())) invisible(dev.off())
}

##### Generate table with coloured cells indicating expression values for selected genes
exprTable <- function(genes, keep_all = FALSE, data, cn_data = NULL, sv_data = NULL, cn_decrease = TRUE, targets, sampleName, int_cancer, ext_cancer, comp_cancer, add_cancer = NULL, genes_annot = NULL, oncokb_annot = NULL, cancer_genes = NULL, mut_annot = NULL, fusion_genes = NULL, ext_links = FALSE, type = "z", scaling = "gene-wise") {
  
  ##### Check which of the selected genes are not present in the expression data
  genes.absent <- genes[ genes %!in% rownames(data) ]
    
  ##### Initiate dataframe for expression median values in each group
  targets.list <- unique(targets$Target)
  group.z <- as.data.frame(matrix(NA, ncol = length(targets.list), nrow = nrow(data)))
  colnames(group.z) <- targets.list
  rownames(group.z) <- rownames(data)
    
  ##### Perform scaling gene-wise
  if ( scaling == "gene-wise" ) {
    ##### Calculate z-score for each group  
    group.stats <- exprGroupsStats_geneWise(data, targets)[[1]]
    
    ##### Make sure to include only genes for which Z-scores were calaculated  (genes with SD = 0 across all samples will give NA)
    group.z <- group.z[ rownames(group.z) %in% rownames(group.stats[[targets.list[1]]]), ]
    
    #### Present expression data as percentiles or z-score values (default)
    for ( group in targets.list ) {
      if ( type == "perc" ) {
        group.z[, group] <- round(group.stats[[ group ]]$quantile, digits=1)
      } else {
        group.z[, group] <- round(group.stats[[ group ]]$z, digits=2)
      }
    }
    
  ##### Perform scaling group-wise
  } else {
    for ( group in targets.list ) {
      
      ##### Calculate z-score for each group  
      group.stats <- exprGroupStats_groupWise(data[rownames(group.z), ], targets, group)
      group.stats <- group.stats[order(rownames(group.stats)), ]
      
      #### Present expression data as percentiles or z-score values (default)
      if ( type == "perc" ) {
        group.z[, group] <- round(group.stats$quantile, digits=1)
      } else {
        group.z[, group] <- round(group.stats$z, digits=2)
      }
    } 
  }
  
  ##### If additional cancer type is defined then remove it from the data
  if ( !is.null(add_cancer) ) {
    group.z <- group.z[ , names(group.z) %!in% add_cancer ]
    targets <- targets[ targets$Target %!in% add_cancer, ]
    targets.list <- targets.list[ targets.list %!in% add_cancer ]
  }
  
  ##### Compute Z-scores sd for each gene across groups
  group.z <- cbind(group.z, round(rowSds(as.matrix(group.z)), digits = 2))
  names(group.z)[ncol(group.z)] <- "SD"
  
  ##### Calculate Z-score differneces between investigated sample and median values in the cancer group of interest
  group.z <- cbind(group.z, round((group.z[, sampleName] - group.z[, comp_cancer]), digits = 2))
  names(group.z)[ncol(group.z)] <- "Diff"
  
  ##### Add NAs for genes that are absent in the expression matrix. In the "Patient vs [comp_cancer]" columns provide "0"s to facilitate interactive sorting the table. These will appear in blank cells in the table
  if ( length(genes.absent) > 0 ) {
    
    NAs.df <- data.frame(matrix(NA, ncol = ncol(group.z), nrow = length(genes.absent)))
    names(NAs.df) <- names(group.z)
    rownames(NAs.df) <- genes.absent
    NAs.df[ names(NAs.df) %in% "Diff" ] <- 0
    group.z <- rbind( group.z,  NAs.df)
  }
  
  ##### Change sample ID to "Patient" for better visualisation
  names(group.z)[names(group.z)==sampleName] <- "Patient"
  targets.list[targets.list==sampleName] <- "Patient"
  
  ##### Reorder groups
  group.z <- cbind(group.z[ , c(ext_cancer, int_cancer, "Patient")], group.z[, c("SD", "Diff" )])
  
  ##### Add "Gene" column to facilitate adding annotations
  group.z$Gene <- rownames(group.z)
  
  ##### Add genes annotation
  if ( !is.null(genes_annot) ) {
    ##### Remove rows with duplicated gene symbols
    if ( "SYMBOL" %in% names(genes_annot) ) {
      genes_annot <- genes_annot[!duplicated(genes_annot$SYMBOL),]  
    }
    
    ##### Merge the dataframe with groups median expression values and gene annotations
    group.z <- merge(genes_annot, group.z, by.x="SYMBOL", by.y="Gene", all = TRUE, sort = FALSE)
    names(group.z) <- gsub("SYMBOL", "Gene", names(group.z))
  }
  
  ##### Define colours for cells background for each group and the patient vs [comp_cancer] difference
  ##### Initiate dataframe for expression median values in each group
  brks.q <- as.data.frame( matrix(NA, ncol = length(targets.list), nrow = length(seq(.05, .95, .0005)) ))
  colnames(brks.q) <- targets.list
  clrs.q <- as.data.frame( matrix(NA, ncol = length(targets.list), nrow = length(seq(.05, .95, .0005))+1 ))
  colnames(clrs.q) <- targets.list
  
  for ( group in c(targets.list, "Diff") ) {
    brks.q[[group]] <- quantile(group.z[, group], probs = seq(.05, .95, .0005), na.rm = TRUE)
    
    clrs_pos.q <- round(seq(255, 150, length.out = length(brks.q[[group]])/2 + 1.5), 0) %>%
    {paste0("rgb(255,", ., ",", ., ")")}
    clrs_neg.q <- rev(round(seq(255, 150, length.out = length(brks.q[[group]])/2 - 0.5), 0)) %>%
    {paste0("rgb(", .,",", .,",", "255)")}
    clrs.q[[group]] <- c(clrs_neg.q, clrs_pos.q)
  }
  
  ##### Subset the expression data to include only the user-defined genes
  group.z <- group.z[ group.z$Gene %in% genes, ]
    
  #### Add variants information to the expression table - if exists. Note, "TIER" and "CONSEQUENCE" columns are required
  if( !is.null(mut_annot) && "TIER" %in% colnames(mut_annot) && length(genes) > 0 ) {
    mut_annot <- mut_annot[mut_annot$SYMBOL %in% genes,]
    
    #### keep only varaints that has the lowest tier value. Multiple varaints detected in same gene but with higher tier will be added to additional column "CONSEQUENCE_OTHER". Applies to the ones that may have multiple mutations and hence tiers
    ##### First, create a list of genes to store multiple variants
    mut_consequence <- vector("list", length(unique(mut_annot$SYMBOL)))
    mut_consequence  <- setNames(mut_consequence,  unique(mut_annot$SYMBOL) )
    
    ##### Record all varaints detected in individual genes
    if ( nrow(mut_annot) > 0 ) {
      for ( i in 1:nrow(mut_annot) ) {
        mut_consequence[[ mut_annot$SYMBOL[i] ]] <- unique(c( mut_consequence[[ mut_annot$SYMBOL[i] ]], mut_annot$CONSEQUENCE[i] ))
      }
      
      mut_annot$CONSEQUENCE_OTHER <- "-"
    }
    
    ##### Remove the first elements since these variant consequences will be reported as the "canonical" CONSEQUENCE
    mut_consequence <- lapply(mut_consequence, function(x) x[-1])
    
    ##### Order variant entires based on tier info, to make sure that the varaints with the lowest tier are reported first
    mut_annot <- mut_annot[ order(mut_annot$TIER), ]
    
    ##### Remove rows with duplicated gene symbols
    mut_annot <- mut_annot[!duplicated(mut_annot$SYMBOL),]  
    rownames(mut_annot) <- mut_annot$SYMBOL
    
    ##### Add other provided variants consequences for individual genes
    for ( gene in rownames(mut_annot) ) {
      if ( length(mut_consequence[[ gene ]]) > 0 ) {
        mut_annot$CONSEQUENCE_OTHER[ match(gene, mut_annot$SYMBOL)  ] <- mut_consequence[[ gene ]]
      }
    }
    
    #### merge the variants information with the dataframe
    group.z <- merge(group.z, mut_annot, by.x = "Gene", by.y = "SYMBOL", all = TRUE, sort = FALSE)
  }
  
  ##### Add CN data if provided
  if ( !is.null(cn_data) ) {
    ##### Get the position of "Diff" column
    col_idx <- grep("Diff", names(group.z), fixed = TRUE)
    
    ##### Now place the CN data after the "Diff" column
    if ( length(genes) > 0 ) {
      group.z <- add_column(group.z, round(cn_data[ group.z$Gene, "CN"], digits=2), .after = col_idx)
      colnames(group.z)[ col_idx+1 ] <- "Patient (CN)"
      cn_range <- base::range(group.z[ ,"Patient (CN)" ], na.rm = TRUE)
      
    } else {
      group.z <- add_column(group.z, "", .after = col_idx)
      colnames(group.z)[ col_idx+1 ] <- "Patient (CN)"
      cn_range <- 0
    }
  }

  ##### Add structural variants results from MANTA
  if ( !is.null(sv_data) && length(genes) > 0 ) {
    ##### NOTE: when merging per-gene exprssion data with SV data from MANTA the "gene" column is used since multiple entires are possible for one gene in MANTA output
    group.z <- merge(group.z, sv_data, by.x="Gene", by.y="Gene", all = TRUE, sort = FALSE)
  }
  
  ##### Add info about known fusion genes
  if ( !is.null(fusion_genes) && length(genes) > 0 ) {
    
    group.z$Fusion_gene <- NA
    group.z$Fusion_gene[ group.z$Gene %in% fusion_genes  ] <- "Yes"
  }
  
  ##### Add cancer gene resources info
  if ( !is.null(cancer_genes) && length(genes) > 0 ) {
    group.z <- merge(group.z, cancer_genes, by.x="Gene", by.y="row.names", all = TRUE, sort = FALSE)
  }
  
  ##### Include only queried genes
  group.z <- group.z[ group.z$Gene %in% genes, ]
  group.z$SYMBOL <- group.z$Gene
  
  ##### Add links to external gene annotation resourses
  if ( ext_links && length(genes) > 0 ) {
    
    ##### Place the external links after the "Diff" column
    ##### Get the position of "Diff" column
    col_idx <- grep("Diff", names(group.z), fixed = TRUE)
    group.z <- add_column(group.z, NA, .after = col_idx)
    names(group.z)[ col_idx+1 ] <- "ext_links"
    
    for ( gene in genes ) {
      ##### Provide link to VICC meta-knowledgebase ( https://search.cancervariants.org )
      group.z$ext_links[ group.z$Gene==gene ] <- paste0("<a href='https://search.cancervariants.org/#", gene, "' target='_blank'>VICC</a>")
      
      ##### Provide link to OncoKB
      if ( !is.null(oncokb_annot) ) {
        if ( gene %in% rownames(oncokb_annot) & oncokb_annot[gene, "OncoKB"] == "Yes" ) {
          group.z$ext_links[ group.z$Gene == gene ] <- paste( group.z$ext_links[ group.z$Gene==gene ] , paste0("<a href='http://oncokb.org/#/gene/", gene, "' target='_blank'>OncoKB</a>"), sep = ", ")
        }
      }
      
      ##### Provide link to CIViC database druggable genes ( https://civicdb.org )
      if ( gene %in% caner_genes_annot.list[["civic_clin_evid"]]$gene ) {
        group.z$ext_links[ group.z$Gene==gene ] <- paste( group.z$ext_links[ group.z$Gene==gene ] , paste0("<a href='", unique(caner_genes_annot.list[["civic_clin_evid"]][ caner_genes_annot.list[["civic_clin_evid"]]$gene == gene , "gene_civic_url"]), "' target='_blank'>CIViC</a>"), sep = ", ")
      }
    }
    
    names(group.z) <- gsub("ext_links", "External resources", names(group.z))
  }
  
  ##### Attach links to GeneCards and Ensembl (if provided). Here we assume that gene names are
  for ( gene in genes ) {
    if ( "ENSEMBL" %in% names(group.z) ) {
        if ( !is.na(group.z$ENSEMBL[ group.z$Gene==gene ]) ) {
          
          group.z$ENSEMBL[ group.z$Gene==gene ] <- paste0("<a href='http://ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=", group.z$ENSEMBL[ group.z$Gene==gene], "' target='_blank'>", group.z$ENSEMBL[ group.z$Gene == gene ], "</a>")
      }
    }
    
    group.z$Gene[ group.z$Gene==gene ] <- paste0("<a href='https://www.genecards.org/cgi-bin/carddisp.pl?gene=", gene, "' target='_blank'>", gene, "</a>")
  }

  ##### Order the data by CN values (to allow filtering based on CN information) and then by the highest absolute values for Patient vs [comp_cancer] difference (to allow filtering based on z-score differences)
  if ( !is.null(cn_data) && length(genes) > 0 ) {
    ##### Get the position of "Patient (CN)" column
    col_idx <- grep("Patient (CN)", names(group.z), fixed = TRUE)
    group.z <- group.z[ order(abs(group.z[, "Diff"]),  decreasing = TRUE), ]
    group.z <- group.z[ order(group.z[ ,col_idx ],  decreasing = cn_decrease), ]
    
  ##### Order the data by increasing TIER category (to allow filtering based on tier information) and then by the highest absolute values for "Diff" difference (to allow filtering based on z-score differences)
  } else if  ( !is.null(mut_annot) && length(genes) > 0 ) {
    group.z <- group.z[ order(abs(group.z[, "Diff"]),  decreasing = TRUE), ]
    group.z <- group.z[ order(group.z$TIER), ]
    
  ##### Order the data by MANTA increasing Tier (to prioritise SVs, based on https://github.com/AstraZeneca-NGS/simple_sv_annotation/blob/master/simple_sv_annotation.py), event type and then by the highest absolute values for Patient vs [comp_cancer] difference
  } else if  ( !is.null(sv_data) && length(genes) > 0 ) {
    group.z <- group.z[ order(abs(group.z[, "Diff"]),  decreasing = TRUE), ]
    group.z <- group.z[ order(group.z$"Fusion genes",  decreasing = TRUE), ]
    group.z <- group.z[ order(group.z$Tier), ]
    
  ##### Otherwise order table by the highest absolute values for Patient vs [comp_cancer] difference
  } else if ( length(genes) > 0 ) {
    group.z <- group.z[ order(abs(group.z[, "Diff"]),  decreasing = TRUE), ]
  }
  
  ##### Remove the internal reference cohort column if the patient samples origins from other tissue. Of note, the internal reference cohort was only used to process the in-house data (including the investigated patient sample) and to correct batch-effects
  if ( comp_cancer != int_cancer ) {
      group.z <- group.z[ , names(group.z) %!in% int_cancer ]
      targets.list[ match(int_cancer, targets.list) ] <- "Patient"
      
      ##### Get the position of "Diff" column
      diff_col_idx <- grep("Diff", names(group.z), fixed = TRUE)
      
  } else {
      ##### Get the position of "Diff" column
      diff_col_idx <- grep("Diff", names(group.z), fixed = TRUE)
      names(group.z)[ match("Diff", names(group.z)) ] <- paste0("Patient vs ", comp_cancer)
  }
  
  ##### Limit the ordered table to maximum of 2000 entries if "keep_all" is set to FALSE (default)
  if ( nrow(group.z) > 2000 && !keep_all ) {
    group.z <- group.z[ 1:2000, ]
  }
  
  ##### Define table height
  if ( nrow(group.z) == 2 ) {
    table_height <- 230
    scrollY <- "67px"
  } else {
    scrollY <- "167px"
    table_height <- 318
  }
  
  ##### Generate a table with genes annotations and coloured expression values in each group
  if ( !is.null(cn_data) ) {
    dt.table <- DT::datatable( data = group.z[, names(group.z) %!in% c("SYMBOL", "SD")], filter="none", rownames = FALSE, extensions = c('Buttons','Scroller'), options = list(pageLength = 10, dom = 'Bfrtip', buttons = c('excel', 'csv', 'pdf','copy','colvis'), scrollX = TRUE, scrollCollapse = TRUE, deferRender = TRUE, scrollY = scrollY, scroller = TRUE), width = 800, height = table_height, caption = htmltools::tags$caption( style = 'caption-side: top; text-align: left; color:grey; font-size:100% ;'), escape = FALSE) %>%
      DT::formatStyle( columns = names(group.z)[names(group.z) %!in% c("SYMBOL", "SD")], `font-size` = '12px', 'text-align' = 'center' ) %>%
      
      ##### Colour cells according to the expression values quantiles in each group
      DT::formatStyle(columns = targets.list[1], 
                      backgroundColor = DT::styleInterval(brks.q[[targets.list[1]]], clrs.q[[targets.list[1]]])) %>%
      DT::formatStyle(columns = targets.list[2], 
                      backgroundColor = DT::styleInterval(brks.q[[targets.list[2]]], clrs.q[[targets.list[2]]])) %>%
      DT::formatStyle(columns = targets.list[3], 
                      backgroundColor = DT::styleInterval(brks.q[[targets.list[3]]], clrs.q[[targets.list[3]]])) %>%
      DT::formatStyle(columns = names(group.z)[diff_col_idx], 
                      backgroundColor = DT::styleInterval(brks.q[["Diff"]], clrs.q[["Diff"]])) %>%
      DT::formatStyle(columns = "Patient (CN)", background = DT::styleColorBar(cn_range, 'lightblue'), backgroundSize = '98% 88%', backgroundRepeat = 'no-repeat', backgroundPosition = 'center')
    
  ##### Generate a table with genes annotations and coloured expression values in each group
  } else {
    dt.table <- DT::datatable( data = group.z[, names(group.z) %!in% c("SYMBOL", "SD")], filter="none", rownames = FALSE, extensions = c('Buttons','Scroller'), options = list(pageLength = 10, dom = 'Bfrtip', buttons = c('excel', 'csv', 'pdf','copy','colvis'), scrollX = TRUE, scrollCollapse = TRUE, deferRender = TRUE, scrollY = scrollY, scroller = TRUE), width = 800, height = table_height, caption = htmltools::tags$caption( style = 'caption-side: top; text-align: left; color:grey; font-size:100% ;'), escape = FALSE) %>%
      DT::formatStyle( columns = names(group.z)[names(group.z) %!in% c("SYMBOL", "SD")], `font-size` = '12px', 'text-align' = 'center' ) %>%
      
      ##### Colour cells according to the expression values quantiles in each group
      DT::formatStyle(columns = targets.list[1], 
                      backgroundColor = DT::styleInterval(brks.q[[targets.list[1]]], clrs.q[[targets.list[1]]])) %>%
      DT::formatStyle(columns = targets.list[2], 
                      backgroundColor = DT::styleInterval(brks.q[[targets.list[2]]], clrs.q[[targets.list[2]]])) %>%
      DT::formatStyle(columns = targets.list[3], 
                      backgroundColor = DT::styleInterval(brks.q[[targets.list[3]]], clrs.q[[targets.list[3]]])) %>%
      DT::formatStyle(columns = names(group.z)[diff_col_idx], 
                      backgroundColor = DT::styleInterval(brks.q[["Diff"]], clrs.q[["Diff"]]))
  }
  
  ##### Clean the space and return output
  rm(genes, data, cn_data, sv_data, targets, sampleName, genes_annot, oncokb_annot, cancer_genes, mut_annot, fusion_genes, genes.absent, targets.list, group.stats, brks.q, clrs.q)
  
  return( list(dt.table,  group.z) )
}

##### Generate table with drugs targeting selected set of genes using info from CIViC database (https://civicdb.org/)
# genes - vector of gene names to look up in the "gene" column of both CIViC tables
# civic_var_summaries - data frame with CIViC variant summaries; the columns "gene", "variant", "variant_types" and "civic_actionability_score" are used
# civic_clin_evid - data frame with CIViC clinical evidence; the columns listed in "drug.info" below plus "gene", "variant", "disease", "doid" and the "*_civic_url" columns are used
# evid_type - only evidence entries with this value in the "evidence_type" column are reported
# var_type - NULL (no filtering) or one of "mutation", "expression", "fusion", "copy_gain", "copy_loss"; any other value leaves the entries unfiltered
# Returns a list with (1) the DT datatable widget and (2) the underlying data frame
civicDrugTable <- function(genes, civic_var_summaries, civic_clin_evid, evid_type = "Predictive", var_type = NULL) {
  
  ##### Initialise an empty data frame defining the columns collected for every gene
  drug.info <- setNames(data.frame(matrix(ncol = 18, nrow = 0)), c("Gene", "Variant", "variant_types", "drugs", "nct_ids", "evidence_level", "evidence_type", "evidence_direction", "clinical_significance", "rating", "civic_actionability_score", "Disease", "phenotypes", "pubmed_id", "variant_origin", "representative_transcript", "representative_transcript2", "last_review_date"))
  
  evid_levels <- c("A" = "A: Validated association", "B" = "B: Clinical evidence", "C" = "C: Case study", "D" = "D: Preclinical evidence", "E" = "E: Inferential association")
  
  ##### Get indices of variants matching any of the keywords (case-insensitive regular expressions). The indices are ordered by keyword first and by row second, without duplicates
  matchVariants <- function(keywords, variants) {
    unique(unlist(lapply(keywords, grep, x = variants, ignore.case = TRUE)))
  }
  
  ##### Limit the clinical evidence entries to variants relevant for the requested variant type
  filterByVarType <- function(evid, gene) {
    
    if ( is.null(var_type) ) {
      return( evid )
    }
    
    ##### Remove entries containing any of the keywords describing non-mutation events
    if ( var_type == "mutation" ) {
      var_type.drop <- matchVariants(c("EXPRESSION", "AMPLIFICATION", "DELETION", "METHYLATION", "WILD TYPE", "FUSION", "REARRANGEMENT", "PHOSPHORYLATION", "COPY", "TRANSCRIPT", "GAIN", "LOSS"), evid[["variant"]])
      
      ##### Negative indexing with an empty vector would select no rows at all, so drop rows only if there is something to drop
      if ( length(var_type.drop) > 0 ) {
        evid <- evid[ -var_type.drop, ]
      }
      return( evid )
    }
    
    ##### For the remaining variant types keep only entries containing the listed keywords
    keywords <- NULL
    if ( var_type == "expression" ) {
      keywords <- c("EXPRESSION", "FUSION", "TRANSCRIPT", "ALTERATION")
    } else if ( var_type == "fusion" ) {
      keywords <- c("FUSION", paste0(gene, "-"), paste0("-", gene), "ALTERATION")
    } else if ( var_type == "copy_gain" ) {
      keywords <- c("AMPLIFICATION", "COPY", "GAIN", "ALTERATION")
    } else if ( var_type == "copy_loss" ) {
      keywords <- c("DELETION", "COPY", "LOSS", "ALTERATION")
    }
    
    ##### Unrecognised variant type - leave the entries untouched
    if ( is.null(keywords) ) {
      return( evid )
    }
    
    evid[ matchVariants(keywords, evid[["variant"]]), ]
  }
  
  ##### Loop through each gene and check if it is druggable
  for ( gene in genes ) {
    
    ##### Get summary info about druggable genes
    if ( gene %in% civic_clin_evid$gene ) {
      
      ##### Extract clinical evidence for all reported variants of the queried gene ("%in%" does not produce all-NA rows for missing gene names)
      clin.evid.info <- civic_clin_evid[ civic_clin_evid$gene %in% gene , ]
      
      ##### Use more descriptive evidence level info. Levels other than A-E are left as they are
      level.known <- clin.evid.info$evidence_level %in% names(evid_levels)
      clin.evid.info$evidence_level[ level.known ] <- unname(evid_levels[ as.character(clin.evid.info$evidence_level[ level.known ]) ])
      
      ##### Subset table to include only variants with the evidence type of interest
      clin.evid.info <- clin.evid.info[ clin.evid.info$evidence_type %in% evid_type,  ]
      
      if ( nrow(clin.evid.info) > 0 ) {
        ##### Provide link to CIViC clinical evidence summary (drugs column)
        clin.evid.info$drugs <- paste0("<a href='", clin.evid.info$evidence_civic_url, "' target='_blank'>", clin.evid.info$drugs, "</a>")
        
        ##### Provide link to CIViC clinical evidence summary (evidence type column)
        clin.evid.info$evidence_type <- paste0("<a href='", clin.evid.info$evidence_civic_url, "' target='_blank'>", clin.evid.info$evidence_type, "</a>")
        
        ##### Provide link to CIViC gene summary
        clin.evid.info$gene_civic_url <- paste0("<a href='", clin.evid.info$gene_civic_url, "' target='_blank'>", gene, "</a>")
        names(clin.evid.info)[ names(clin.evid.info) =="gene_civic_url" ] <- "Gene"
        
        ##### Provide link to CIViC variants summary
        clin.evid.info$variant_civic_url <- paste0("<a href='", clin.evid.info$variant_civic_url, "' target='_blank'>", clin.evid.info$variant, "</a>")
        names(clin.evid.info)[ names(clin.evid.info) =="variant_civic_url" ] <- "Variant"
        
        ##### Provide link to ClinicalTrials.gov based on NCT IDs
        for ( nct_id in clin.evid.info$nct_ids ) {
          if ( !is.empty(nct_id) ) {
            
            ##### Deal with multiple NCT IDs (separated by comma). The gsub calls remove the spaces that paste() inserts into the URL
            nct_id_split <- unlist(strsplit(nct_id, split=",", fixed=TRUE))
            nct_id_url <- gsub(" '" , "'", paste(gsub("/ " , "/", paste("<a href='https://clinicaltrials.gov/ct2/show/", nct_id_split , "' target='_blank'>", nct_id_split, "</a>")), collapse = ", "))
            clin.evid.info$nct_ids[ clin.evid.info$nct_ids==nct_id ] <- nct_id_url
          }
        }
        
        ##### Provide link to PubMed
        clin.evid.info$pubmed_id <- paste0("<a href='https://www.ncbi.nlm.nih.gov/pubmed/", clin.evid.info$pubmed_id, "' target='_blank'>", clin.evid.info$pubmed_id, "</a>")
        
        ##### Provide link to Disease Ontology
        clin.evid.info$doid <- paste0("<a href='http://www.disease-ontology.org/?id=DOID:", clin.evid.info$doid, "' target='_blank'>", clin.evid.info$disease, "</a>")
        names(clin.evid.info)[ names(clin.evid.info) =="doid" ] <- "Disease"
        
        ##### Extract info about all variants in that gene
        var.info <- civic_var_summaries[ civic_var_summaries$gene %in% gene , ]
        var.info <- var.info[, c("variant", "variant_types", "civic_actionability_score")]
        var.info[,"variant_types"] <- gsub("_", " ", var.info[,"variant_types"])
        var.info[,"variant_types"] <- gsub(",", ", ", var.info[,"variant_types"])
        
        ##### Merge info about all variants in that gene with the clinical evidence info
        clin.evid.info <- merge(clin.evid.info, var.info, by = "variant", all.x = TRUE)
        
        ##### Filter drug matching info depending on the variant type
        clin.evid.info <- filterByVarType(clin.evid.info, gene)
      }
      
      if ( nrow(clin.evid.info) > 0 ) {
        ##### Subset table to include only most important info
        clin.evid.info <- clin.evid.info[ , names(drug.info)]
        
        ##### Add drugs info for subsequent gene
        drug.info <- rbind(drug.info, clin.evid.info)
      }
    }
  }
  
  ##### Use more friendly column names for the table
  names(drug.info) <- c("Gene", "Variant", "Variant type", "Drugs", "Clinical trials", "Evidence level", "Evidence type", "Evidence direction", "Clinical significance", "Trust rating", "Actionability score", "Disease", "Phenotypes", "PubMed ID",  "Variant origin", "Representative transcript", "Representative transcript 2", "Review date")
  
  ##### Limit the info to fewer columns
  drug.info <- drug.info[ , c("Gene", "Variant", "Variant type", "Drugs", "Clinical trials", "Evidence level", "Evidence direction", "Clinical significance", "Trust rating", "Actionability score", "Disease", "Phenotypes", "PubMed ID",  "Representative transcript", "Representative transcript 2")] 
  
  ##### Generate a table
  dt.table <- DT::datatable( data = drug.info, filter = "none", rownames = FALSE, extensions = c('Buttons','Scroller'), options = list(pageLength = 10, dom = 'Bfrtip', buttons = c('excel', 'csv', 'pdf','copy','colvis'), scrollX = TRUE, deferRender = TRUE, scrollY = "167px", scroller = TRUE), width = 800, caption = htmltools::tags$caption(style = 'caption-side: top; text-align: left; color:grey; font-size:100% ;'), escape = FALSE) %>%
    DT::formatStyle( columns = names(drug.info), `font-size` = '12px', 'text-align' = 'center' ) %>%
    ##### Colour cells according to evidence level and trust rating
    DT::formatStyle(columns = "Evidence level", 
                    backgroundColor = DT::styleEqual(c("A: Validated association", "B: Clinical evidence", "C: Case study", "D: Preclinical evidence", "E: Inferential association"), c("mediumseagreen", "deepskyblue", "mediumpurple", "darkorange", "coral")) )  %>%
    DT::formatStyle(columns = "Trust rating", 
                    backgroundColor = DT::styleEqual(c(1:5), c("coral", "azure", "lightskyblue", "palegreen", "mediumseagreen")) )
  
  ##### Return output (local objects are discarded when the function exits, so no explicit clean-up is needed)
  return( list(dt.table,  drug.info) )
}

##### Code from UMCCRISE to prioritise SV events (version for "-sv-prioritize-manta-pass.tsv" files, https://github.com/umccr/umccrise/blob/master/umccrise/rmd_files/index.Rmd)
# sv_file - path to a tab-separated file with a header line. The columns "annotation", "chrom", "start", "end", "split_read_support" and "paired_support_PR" are used
# Returns a table with one row per distinct annotated event (columns Location, Gene, Priority, Tier, Annotation, Event, SR, PR), or NULL (with a warning) if the file holds no more than one line
sv_prioritize_short <- function(sv_file) {
  
  ##### A file with the header line only (or an empty file) contains no events to report
  if ( length(readLines(con = sv_file, n = 2)) <= 1 ) {
    warning('No prioritized events detected')
    return( NULL )
  }
  
  sv_all <- readr::read_tsv(sv_file, col_names = TRUE) %>%
    ##### Unpack multiple comma-separated annotations per region into one row each
    dplyr::mutate(annotation = strsplit(annotation, ',')) %>%
    tidyr::unnest(annotation) %>%
    ##### Unpack annotation columns
    tidyr::separate(annotation,
                    c('Event', 'Annotation', 'Gene', 'Transcript', 'Priority', 'Tier'),
                    sep = '\\|', convert = TRUE) %>%
    dplyr::mutate(start = format(start, big.mark = ',', trim = TRUE),
                  end = format(end, big.mark = ',', trim = TRUE)) %>%
    ##### Location is reported as "chrom:start". The previous conditional on missing "end" returned the same value in both branches, so it was dropped
    dplyr::mutate(Location = stringr::str_c(chrom, ':', start)) %>%
    dplyr::mutate(SR = split_read_support, PR = paired_support_PR) %>%
    dplyr::select(Location, Gene, Priority, Tier, Annotation, Event, SR, PR) %>%
    dplyr::distinct()
  
  return( sv_all )
}

##### Code from UMCCRISE to prioritise SV events (version for "-manta.tsv" files https://github.com/umccr/umccrise/blob/master/umccrise/rmd_files/index.Rmd)
# sv_file - path to a tab-separated file with a header line
# Two column layouts are handled: the one with "AF_BPI", "AF_PURPLE", "CN_PURPLE", "CN_change_PURPLE" and "Ploidy_PURPLE" columns, and the one with "BPI_AF", "AF", "CN", "CN_change" and "Ploidy" columns
# Returns a table with one row per distinct annotated event, or NULL (with a warning) if the file holds no more than one line
# Relies on "split_sv_field" and "subset_genes" functions defined in this file
sv_prioritize <- function(sv_file) {
  
  ##### A file with the header line only (or an empty file) contains no events to report
  if ( length(readLines(con = sv_file, n = 2)) <= 1 ) {
    warning('No prioritized events detected')
    return( NULL )
  }
  
  ##### Parse the file once and use it for both the layout check and the processing
  sv_raw <- readr::read_tsv(sv_file, col_names = TRUE)
  
  ##### Steps shared by both layouts: split read support fields, filter BND events, unpack annotations and build the location
  unpack_sv_records <- function(sv) {
    sv %>%
      tidyr::separate(split_read_support, c("SR (ref)", "SR (alt)"), ",") %>% 
      dplyr::mutate(SR = as.integer(`SR (alt)`)) %>% 
      tidyr::separate(paired_support_PR, c("PR (ref)", "PR (alt)"), ",") %>% 
      dplyr::mutate(PR = as.integer(`PR (alt)`)) %>% 
      tidyr::separate(paired_support_PE, c("PE (ref)", "PE (alt)"), ",") %>% 
      dplyr::mutate(PE = as.integer(`PE (alt)`)) %>% 
      ##### Keep non-BND events, and BND events with missing SR or with PR greater than SR
      dplyr::filter(svtype != 'BND' | is.na(SR) | PR>SR) %>% 
      ##### Unpack multiple comma-separated annotations per region into one row each
      dplyr::mutate(annotation = strsplit(annotation, ',')) %>% 
      tidyr::unnest(annotation) %>% 
      ##### Unpack annotation columns
      tidyr::separate(annotation,
                      c('Event', 'Effect', 'Genes', 'Transcript', 'Detail', 'Tier'),
                      sep = '\\|', convert = TRUE) %>% 
      dplyr::mutate(start = format(start, big.mark = ',', trim = TRUE),
                    end = format(end, big.mark = ',', trim = TRUE)) %>% 
      ##### Location is reported as "chrom:start". The previous conditional on missing "end" returned the same value in both branches, so it was dropped
      dplyr::mutate(location = stringr::str_c(chrom, ':', start))
  }
  
  ##### Steps shared by both layouts: report the first two genes, list the remaining ones separately and split multiple effects
  summarise_genes <- function(sv) {
    sv %>%
      dplyr::mutate(Gene = subset_genes(Genes, c(1, 2)),
                    Gene = ifelse((stringr::str_split(Genes, '&') %>% purrr::map_int(length)) > 2,
                                  stringr::str_c(Gene, '...', sep = ', '),
                                  Gene),
                    `Other affected genes` = subset_genes(Genes, -c(1,2)) %>% stringr::str_replace_all('&', ', '),
                    ##### Gene fusions keep the "&" separator between the partner genes
                    Gene = ifelse(stringr::str_detect(Effect, "gene_fusion"),
                                  Gene,
                                  Gene %>% stringr::str_replace_all('&', ', '))
      ) %>% 
      tidyr::separate(Effect, c("Effect", "Other effects"), sep = '&')
  }
  
  ##### Due to changes in PURPLE output format there are two expected column names combinations
  if ( all(c("AF_BPI", "AF_PURPLE", "CN_PURPLE", "CN_change_PURPLE", "Ploidy_PURPLE") %in% names(sv_raw)) ) {
    
    sv_all <- sv_raw %>%
      dplyr::select(-caller, -sample) %>% 
      split_sv_field(AF_BPI, is_pct = TRUE) %>% 
      split_sv_field(AF_PURPLE, is_pct = TRUE) %>% 
      split_sv_field(CN_PURPLE) %>% 
      split_sv_field(CN_change_PURPLE) %>% 
      dplyr::mutate(
        Ploidy_PURPLE = as.double(Ploidy_PURPLE),
        Ploidy_PURPLE = format(Ploidy_PURPLE, nsmall = 2)
      ) %>% 
      unpack_sv_records() %>% 
      dplyr::arrange(Tier, Effect, dplyr::desc(AF_PURPLE), Genes) %>% 
      summarise_genes() %>% 
      dplyr::select(Tier = tier, Event = svtype, Gene, Effect = Effect, Detail = Detail, Location = location,
                    AF = AF_PURPLE, `CN chg` = CN_change_PURPLE, SR, PR, CN = CN_PURPLE, Ploidy = Ploidy_PURPLE, PURPLE_status,
                    `SR (ref)`, `PR (ref)`, PE, `PE (ref)`, `Somatic score` = somaticscore, Transcript = Transcript,
                    `Other effects`, `Other affected genes`,
                    `AF at breakpoint 1` = AF_PURPLE1, `AF at breakpoint 2` = AF_PURPLE2,
                    `CN at breakpoint 1` = CN_PURPLE1, `CN at breakpoint 2` = CN_PURPLE2,
                    `CN change at breakpoint 1` = CN_change_PURPLE1, `CN change at breakpoint 2` = CN_change_PURPLE2,
                    `AF before adjustment, bp 1` = AF_BPI1, `AF before adjustment, bp 2` = AF_BPI2
      ) %>%
      dplyr::distinct()
    
  } else {
    
    sv_all <- sv_raw %>%
      dplyr::select(-caller, -sample) %>% 
      split_sv_field(BPI_AF, is_pct = TRUE) %>% 
      split_sv_field(AF, is_pct = TRUE) %>% 
      split_sv_field(CN) %>% 
      split_sv_field(CN_change) %>% 
      dplyr::mutate(
        Ploidy = as.double(Ploidy),
        Ploidy = format(Ploidy, nsmall = 2)
      ) %>% 
      unpack_sv_records() %>% 
      dplyr::arrange(Tier, Effect, dplyr::desc(AF), Genes) %>% 
      summarise_genes() %>% 
      dplyr::select(Tier = tier, Event = svtype, Gene, Effect = Effect, Detail = Detail, Location = location,
                    AF, `CN chg` = CN_change, SR, PR, CN, Ploidy, PURPLE_status,
                    `SR (ref)`, `PR (ref)`, PE, `PE (ref)`, `Somatic score` = somaticscore, Transcript = Transcript,
                    `Other effects`, `Other affected genes`,
                    `AF at breakpoint 1` = AF1, `AF at breakpoint 2` = AF2,
                    `CN at breakpoint 1` = CN1, `CN at breakpoint 2` = CN2,
                    `CN change at breakpoint 1` = CN_change1, `CN change at breakpoint 2` = CN_change2,
                    `AF before adjustment, bp 1` = BPI_AF1, `AF before adjustment, bp 2` = BPI_AF2
      ) %>%
      dplyr::distinct()
  }
  
  return( sv_all )
}

##### Function used in the "sv_prioritize" function
# Select genes by position from "&"-separated gene lists
# genes - character vector, each element holding gene names separated by "&"
# ind - index vector applied to the genes of each element (e.g. c(1, 2) for the first two, -c(1, 2) for all but the first two)
# Returns a character vector of the same length as "genes" with the selected genes joined by "&" ("" if nothing is selected)
# Empty gene names and positions beyond the end of the list are dropped. The previous replace("", NA) call indexed by the name "" rather than matching empty strings, so empty names were never removed
subset_genes <- function(genes, ind) {
  gene_sets <- strsplit(as.character(genes), "&", fixed = TRUE)
  vapply(gene_sets, function(gene_set) {
    picked <- gene_set[ind]
    picked <- picked[ !is.na(picked) & picked != "" ]
    paste(picked, collapse = "&")
  }, character(1), USE.NAMES = FALSE)
}

##### Function used in the "sv_prioritize" function
# Format values for display using format(val, digits = 1), optionally followed by "%"
# val - vector of values to format; the whole vector is formatted in one format() call, so elements are padded to a common width
# is_pct - if TRUE, "%" is appended to every formatted value
# Returns a character vector of the same length as "val", with NA for missing values. The result is always character (the ifelse-based version returned a logical vector for all-NA or empty input)
format_val <- function(val, is_pct = FALSE) {
  
  ##### paste0 would turn empty input into a single string
  if ( length(val) == 0 ) {
    return( character(0) )
  }
  
  suffix <- if ( isTRUE(is_pct) ) "%" else ""
  formatted <- paste0(format(val, digits = 1), suffix)
  
  ##### format() renders missing values as the text "NA" - restore them as real missing values
  formatted[ is.na(val) ] <- NA_character_
  formatted
}

##### Function used in the "sv_prioritize" function
# Split a column holding one or two comma-separated values into two numeric-derived columns and replace the original column with their mean
# .data - data frame
# field - unquoted name of the column to split; the new columns are named "<field>1" and "<field>2"
# is_pct - if TRUE, values are multiplied by 100 and "%" is appended when formatting
# Returns ".data" with "<field>", "<field>1" and "<field>2" formatted by the "format_val" function defined in this file
split_sv_field <- function(.data, field, is_pct = FALSE) {
  f_q <- rlang::enquo(field)
  f_str <- rlang::quo_name(f_q)
  f1_str <- paste0(f_str, '1')
  f2_str <- paste0(f_str, '2')
  f1_q <- rlang::sym(f1_str)
  f2_q <- rlang::sym(f2_str)
  pct_factor <- if ( isTRUE(is_pct) ) 100 else 1
  
  .data %>% 
    ##### Fields with a single value are expected (handled below), so fill the second column with NA without raising a warning
    tidyr::separate(!!f_q, c(f1_str, f2_str), ",", fill = "right") %>% 
    dplyr::mutate(
      !!f1_q := as.double(!!f1_q) * pct_factor,
      !!f2_q := as.double(!!f2_q) * pct_factor,
      ##### Mean of both values; if the second one is missing the first one is used twice
      !!f_q  := (!!f1_q + ifelse(is.na(!!f2_q), !!f1_q, !!f2_q)) / 2,
      !!f_q  := format_val(!!f_q, is_pct),
      !!f1_q := format_val(!!f1_q, is_pct),
      !!f2_q := format_val(!!f2_q, is_pct)
    )
}

##### Capitalise the first letter of every space-separated word
# y - character string; only its first element is processed
# Returns a single string with the words joined back by single spaces
CapStr <- function(y) {
  words <- strsplit(y, " ")[[1]]
  initials <- toupper(substring(words, 1, 1))
  remainders <- substring(words, 2)
  paste0(initials, remainders, collapse = " ")
}

##### A wrapper to saveWidget which compensates for arguable BUG in saveWidget which requires `file` to be in current working directory (see post https://github.com/ramnathv/htmlwidgets/issues/299 )
# widget - widget passed on to htmlwidgets::saveWidget
# file - output file path; the working directory is temporarily changed to its directory and only the file name is passed on
# ... - further arguments passed on to htmlwidgets::saveWidget
saveWidgetFix <- function ( widget, file, ...) {
  
  ##### Restore the caller's working directory when the function exits, also on error
  original_dir <- getwd()
  on.exit(setwd(original_dir))
  
  target_dir <- dirname(file)
  target_name <- basename(file)
  
  setwd(target_dir)
  htmlwidgets::saveWidget(widget, file = target_name, ...)
}

##### Define function for generating spider web plots to present immunogram genes (code from http://www.statisticstoproveanything.com/2013/11/spider-web-plots-in-r.html)
# data - data.frame or matrix
# data.row - row of data to plot, given as index or row name (if NULL uses row 1)
# y.cols - columns of interest (if NULL it selects all numeric columns)
# main - title of plot (if NULL then rowname of data)
# add - whether the plot should be added to an existing plot
# col - color of the data line
# lty - lty of the data line
# scale - whether the selected columns should be centred and scaled, and then divided by their maximum absolute value
# NOTE: the active graphics device (if any) is closed at the end of the function

webplot <- function(data, data.row = NULL, y.cols = NULL, main = NULL, add = F, 
    col = "red", lty = 1, scale = T) {
  
  ##### Validate the input
  if (!is.matrix(data) && !is.data.frame(data)) {
    stop("Requires matrix or data.frame")
  }
  if (is.null(y.cols)) {
    y.cols <- colnames(data)[sapply(data, is.numeric)]
  }
  if (sum(!sapply(data[, y.cols], is.numeric)) > 0) {
    non_numeric <- paste0("\"", colnames(data)[!sapply(data, is.numeric)], "\"", 
      collapse = ", ")
    stop(paste0("All y.cols must be numeric\n", non_numeric, " are not numeric"))
  }
  
  ##### Resolve the row to plot
  if (is.null(data.row)) {
    data.row <- 1
  }
  if (is.character(data.row)) {
    if (!(data.row %in% rownames(data))) {
      stop("Invalid value for data.row:\nMust be a valid rownames(data) or row-index value")
    }
    data.row <- which(rownames(data) == data.row)
  }
  if (is.null(main)) {
    main <- rownames(data)[data.row]
  }
  
  ##### Bring the selected columns into the [-1, 1] range
  if (scale == TRUE) {
    data <- scale(data[, y.cols])
    data <- apply(data, 2, function(x) x/max(abs(x)))
  }
  data <- as.data.frame(data)
  
  ##### One axis per column, starting at the top of the plot (90 degrees)
  n.y <- length(y.cols)
  polar.vals <- (90 + seq(0, 360, length.out = n.y + 1)) * pi/180
  
  ##### Draw the empty web: axes, axis labels and reference circles
  if (add == FALSE) {
    plot(0, xlim = c(-2.2, 2.2), ylim = c(-2.2, 2.2), type = "n", axes = F, 
      xlab = "", ylab = "")
    title(main)
    
    for (angle in polar.vals) {
      lines(c(0, 2 * cos(angle)), c(0, 2 * sin(angle)))
    }
    for (axis_idx in 1:n.y) {
      text(2.15 * cos(polar.vals[axis_idx]), 2.15 * sin(polar.vals[axis_idx]), 
        y.cols[axis_idx], cex = 0.8)
    }
    
    circle <- seq(0, 2 * pi, length.out = 100)
    for (radius in seq(0.5, 2, 0.5)) {
      lines(radius * cos(circle), radius * sin(circle), lwd = 0.5, lty = 2, col = "gray60")
    }
    lines(cos(circle), sin(circle), lwd = 1.2, col = "gray50")
  }
  
  ##### Draw the data line; values are shifted by 1 so that 0 falls on the solid reference circle
  r <- 1 + data[data.row, y.cols]
  xs <- r * cos(polar.vals)
  ys <- r * sin(polar.vals)
  
  ##### Close the polygon by repeating the first point
  xs <- c(xs, xs[1])
  ys <- c(ys, ys[1])
  lines(xs, ys, col = col, lwd = 2, lty = lty)
  
  #### Clear plots to free up some memory
  if (!is.null(dev.list())) invisible(dev.off())
}
##### Generate a full-resolution pdf image before generating a small image in the chunk
##### Registers "allow_thumbnails" as the knitr "plot" hook. NOTE(review): "allow_thumbnails" is not defined in this part of the file - presumably it is one of the functions defined earlier; confirm
knitr::knit_hooks$set(plot = allow_thumbnails)
##### Load libraries
suppressMessages(library(edgeR))
suppressMessages(library(limma))
suppressMessages(library(EDASeq))
suppressMessages(library(preprocessCore))
suppressMessages(library(rapportools))
suppressMessages(library(tximport))
suppressMessages(library(rhdf5))
suppressMessages(library(openxlsx))
suppressMessages(library(readr))
suppressMessages(library(tidyverse))
suppressMessages(library(dplyr))
suppressMessages(library(tidyr))
suppressMessages(library(rlang))
suppressMessages(library(DT))
suppressMessages(library(matrixStats))
suppressMessages(library(tibble))
suppressMessages(library(knitr))
suppressMessages(library(scales))
suppressMessages(library(RCircos))
suppressMessages(library(ggplot2))
suppressMessages(library(ggforce))
suppressMessages(library(pdftools))
suppressMessages(library(png))
suppressMessages(library(htmltools))
suppressMessages(library(htmlwidgets))
suppressMessages(library(devtools))
suppressMessages(library(lares))
suppressMessages(library(package=paste0("EnsDb.Hsapiens.v", params$ensembl_version), character.only = TRUE))
suppressMessages(library(package=paste0("BSgenome.Hsapiens.UCSC.hg", params$ucsc_genome_assembly), character.only = TRUE))
##### Define Z-transformation direction
##### "gene-wise" is used only if explicitly requested (case-insensitive). Any other value, including a missing parameter, falls back to "group-wise" instead of failing on a zero-length condition
if ( identical(tolower(params$scaling), "gene-wise") ) {
  scaling <- "gene-wise"
} else {
  scaling <- "group-wise"
}
##### Annotate transcripts with gene IDs
##### Look the EnsDb object up by its name (it is provided by the EnsDb.Hsapiens package attached above) rather than evaluating parsed text
edb <- get(paste0("EnsDb.Hsapiens.v", params$ensembl_version))

##### Get all gene IDs available in the database to be used as query keys
keys <- keys(edb, keytype="GENEID")

##### Get the transcript ID to gene ID mapping and rename the columns to "tx_name" and "gene_id"
tx2ensembl <- ensembldb::select(edb, keys=keys, columns=c("TXID", "GENEID"), keytype="GENEID")
names(tx2ensembl) <- gsub("TXID", "tx_name", names(tx2ensembl))
names(tx2ensembl) <- gsub("GENEID", "gene_id", names(tx2ensembl))

##### Clean the space
rm(edb, keys)
##### Load reference datasets
##### Define the reference datasets based on user-defined input
dataset <- toupper(params$dataset)

##### For each reference cohort keep: (1) counts file, (2) samples annotation (target) file, (3) label used for the cohort
##### The external (TCGA) counts file and label are based on the part of the dataset name preceding the first "-"
ref_dataset <- local({
  ref_dir <- paste0(params$ref_data_dir, "/ref_data/")
  cohort <- strsplit(dataset, split='-', fixed=TRUE)[[1]][1]
  
  list(
    "ext_ref" = c(
      paste0(ref_dir, "TCGA_", cohort, "_Counts.exp.gz"),
      paste0(ref_dir, "TCGA_", dataset, "_Target.txt"),
      paste0(cohort, " (TCGA)")
    ),
    "int_ref" = c(
      paste0(ref_dir, "UMCCR_PDAC_Counts.exp.gz"),
      paste0(ref_dir, "UMCCR_PDAC_Target.txt"),
      "PAAD (UMCCR)"
    )
  )
})

##### Create a list with reference datasets (one empty slot per dataset)
ref_dataset.list <- setNames(vector("list", length(dataset)), dataset)

##### Create a list with various sets of genes
ref_genes <- c("genes_cancer", "genes_oncokb", "genes_immune", "genes_hrd")
ref_genes.list <- setNames(vector("list", length(ref_genes)), ref_genes)

##### Create a list with cancer genes annotations
caner_genes_annot <- c("oncokb_clin_vars", "oncokb_all_vars")
caner_genes_annot.list <- setNames(vector("list", length(caner_genes_annot)), caner_genes_annot)

##### Get the subject ID
##### A missing (NULL), NA or non-scalar parameter results in an empty subject ID rather than a failure on an invalid "if" condition
if ( length(params$subject_id) == 1 && !is.na(params$subject_id) ) {
  subjectID <- params$subject_id
} else {
  subjectID <- ""
}

##### Read in the expression data for the investigated sample. The bcbio-nextgen (kallisto) results take precedence over the DRAGEN (salmon) results if both are provided
if ( !is.null(params$bcbio_rnaseq) ) {
  
  ##### Get patient data dir and sample file name
  dataDir <- params$bcbio_rnaseq
  
  ##### Look at countsFromAbundance parameter to change the method to generate the counts
  txi.kallisto <- tximport(paste0(dataDir, "/kallisto/abundance.tsv"), type = "kallisto", tx2gene = tx2ensembl)
  
  ##### Extract kallisto counts to prepare dataframe
  counts <- as.data.frame(txi.kallisto$counts) %>%
    tibble::rownames_to_column() %>%
    dplyr::rename(count = V1)
  
} else if ( !is.null(params$dragen_rnaseq) ) {
  
  ##### Get patient data dir and sample file name
  dataDir <- paste(params$dragen_rnaseq, "dragen", sep = "/")
  
  ##### Look at countsFromAbundance parameter to change the method to generate the counts
  txi.salmon <- tximport(paste0(dataDir, "/", list.files(dataDir, pattern="\\.sf$")), type = "salmon", tx2gene = tx2ensembl)
  
  ##### Extract salmon counts to prepare dataframe
  counts <- as.data.frame(txi.salmon$counts) %>%
    tibble::rownames_to_column() %>%
    dplyr::rename(count = V1)
  
} else {
  
  ##### Without either input "dataDir" and "counts" would stay undefined and the report would fail later with an unrelated "object not found" error
  stop("Neither 'bcbio_rnaseq' nor 'dragen_rnaseq' parameter was provided - cannot load the sample expression data", call. = FALSE)
}

##### Create directory for results
results_dir <- paste0(params$report_dir, "/", params$sample_name, params$dataset_name_incl, ".results")

if ( !file.exists(results_dir) ) {
  dir.create(results_dir, recursive=TRUE)
}

##### Check if spreadsheet with clinical information exists
clinical_info_file <- params$clinical_info
runClinicalChunk <- FALSE

##### The file is read only if a single existing path was provided. A missing (NULL) or non-character parameter skips the clinical information rather than failing on an invalid "if" condition
if ( is.character(clinical_info_file) && length(clinical_info_file) == 1 && file.exists(clinical_info_file) ) {
  ref_dataset.list[[dataset]][["clinical_info"]] <- openxlsx::read.xlsx(xlsxFile = clinical_info_file, sheet = 1, colNames = TRUE, rowNames = FALSE, detectDates = TRUE, skipEmptyRows = TRUE, skipEmptyCols = TRUE, check.names = TRUE)
  runClinicalChunk <- TRUE
}

##### Read in selected genes lists (tab-delimited files with a header, located in the reference data directory)
##### Cancer genes
ref_genes.list[["genes_cancer"]] <- read.table(
  file = paste(params$ref_data_dir, params$genes_cancer, sep="/"),
  header = TRUE, sep = "\t", quote = "", row.names = NULL, as.is = TRUE
)

##### OncoKB genes (the comment character is disabled for this file)
ref_genes.list[["genes_oncokb"]] <- read.table(
  file = paste(params$ref_data_dir, params$oncokb_genes, sep="/"),
  header = TRUE, sep = "\t", quote = "", row.names = NULL, as.is = TRUE, comment.char = ""
)

##### Immune markers
ref_genes.list[["genes_immune"]]$immune_markers <- read.table(
  file = paste(params$ref_data_dir, params$genes_immune_markers, sep="/"),
  header = TRUE, sep = "\t", quote = "", row.names = NULL, as.is = TRUE
)

##### HRD genes
ref_genes.list[["genes_hrd"]] <- read.table(
  file = paste(params$ref_data_dir, params$genes_hrd, sep="/"),
  header = TRUE, sep = "\t", quote = "", row.names = NULL, as.is = TRUE
)

##### Immunogram genes are read only if the immunogram was requested
if ( params$immunogram ) {
  ref_genes.list[["genes_immune"]]$immunogram <- read.table(
    file = paste(params$ref_data_dir, params$genes_immunogram, sep="/"),
    header = TRUE, sep = "\t", quote = "", row.names = NULL, as.is = TRUE
  )
}

##### Read in gene fusion data for the investigated sample
##### Arriba fusion calls: locate the output table and the pdf booklet with fusion plots
arriba_dir <- paste(dataDir, "arriba", sep = "/")
arriba_file <- paste(arriba_dir, "fusions.tsv", sep = "/")
arriba_pdf <- paste(arriba_dir, "fusions.pdf", sep = "/")
runArribaChunk <- FALSE
runFusionChunk <- FALSE

##### File listing the fusion events for which Arriba plot is available (for PIEdb portal).
##### It is always written; it holds a single empty entry when Arriba reported nothing
arriba_fusions_out <- paste0(results_dir, "/", params$sample_name, params$dataset_name_incl, ".RNAseq_report.arriba_fusions.txt")
arriba_fusions_reported <- FALSE
arriba_events <- ""

if ( file.exists(arriba_file) ) {
  ref_genes.list[["arriba"]] <- read.table(file = arriba_file, header = TRUE, comment.char = "", quote = "")

  ##### Proceed only if at least one fusion has been reported by Arriba
  arriba_fusions_reported <- nrow(ref_genes.list[["arriba"]]) > 0
}

if ( arriba_fusions_reported ) {

  ##### Convert Arriba pdf booklet with fusion plots to png images
  if ( file.exists(arriba_pdf) ) {
    arriba_plots(arriba_file = arriba_file, arriba_results = ref_genes.list[["arriba"]], results_dir = paste0(results_dir, "/arriba"))
  }

  ##### Event labels are built as <gene1>__<gene2>_<breakpoint1>-<breakpoint2> with ":" replaced by "."
  fusion <- gsub(":", ".", c("", paste0(make.names(paste(ref_genes.list[["arriba"]]$X.gene1, ref_genes.list[["arriba"]]$gene2, sep = "__")), "_", ref_genes.list[["arriba"]]$breakpoint1, "-", ref_genes.list[["arriba"]]$breakpoint2)))
  arriba_events <- fusion
}

write.table(prepare2write(arriba_events), file = arriba_fusions_out, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE, append = FALSE )

if ( arriba_fusions_reported ) {
  runArribaChunk <- TRUE
  runFusionChunk <- TRUE
}

rm(arriba_dir, arriba_fusions_out, arriba_fusions_reported, arriba_events)

##### Read in dragen fusion calls
##### Look for Dragen's fusion output file(s). list.files() returns only files that exist, so an empty
##### result means "no Dragen fusion calls". (Pasting the directory with an empty file list would
##### give "<dataDir>/", which file.exists() reports as present and read.table() cannot read)
dragen_fusion_file <- list.files(dataDir, pattern = "\\.fusion_candidates\\.final$", full.names = TRUE)
runDragenFusionChunk <- FALSE

if ( !is.null(params$dragen_rnaseq) && length(dragen_fusion_file) > 0 ) {

  ##### Dragen's fusion output file header starts with '#' hence change the comment indicator option to '^' ( https://stackoverflow.com/questions/27196470/reading-a-line-that-starts-with-a-hash-on-a-txt-file )
  ##### If more than one file matches, only the first one is used
  dragen_fusion <- read.table(file = dragen_fusion_file[1], header = TRUE, comment.char = '^', quote = "")

  ##### Check Dragen's fusion format version and drop the "X." prefix that read.table adds to the "#FusionGene" header
  #####  Dragen's fusion format version 3.9.3
  if ( all(c("X.FusionGene", "Score", "LeftBreakpoint", "RightBreakpoint", "Gene1Location", "Gene2Location", "Gene1Sense", "Gene2Sense", "Gene1Id", "Gene2Id", "NumSplitReads", "NumSoftClippedReads", "NumPairedReads", "ReadNames") %in% colnames(dragen_fusion)) ) {
    colnames(dragen_fusion) <- c("FusionGene", "Score", "LeftBreakpoint", "RightBreakpoint", "Gene1Location", "Gene2Location", "Gene1Sense", "Gene2Sense", "Gene1Id", "Gene2Id", "NumSplitReads", "NumSoftClippedReads", "NumPairedReads", "ReadNames")
  } else if ( all(c("X.FusionGene", "Score", "LeftBreakpoint", "RightBreakpoint", "ReadNames") %in% colnames(dragen_fusion)) ) {
    colnames(dragen_fusion) <- c("FusionGene", "Score", "LeftBreakpoint", "RightBreakpoint", "ReadNames")
  }

  ##### Split the "geneA--geneB" fusion name into two separate gene columns
  dragen_fusion_genes <- dragen_fusion %>%
    tidyr::separate(col = FusionGene, into = c("gene1", "gene2"), sep = "--")

  ref_genes.list[["dragenFusion"]] <- dragen_fusion_genes

  runDragenFusionChunk <- TRUE
  runFusionChunk <- TRUE
}


##### Read in pizzly fusion calls
##### Two candidate pizzly output files are considered: the full and the filtered table
pizzly_dir <- paste(dataDir, "pizzly", sep = "/")
pizzly_file <- paste(pizzly_dir, paste0(params$sample_name, "-flat.tsv"), sep = "/")
pizzly_file_filtered <- paste(pizzly_dir, paste0(params$sample_name, "-flat-filtered.tsv"), sep = "/")
runPizzlyChunk <- FALSE

##### The full table is used only for bcbio input; otherwise fall back to the filtered table (if present)
pizzly_input <- NULL

if ( !is.null(params$bcbio_rnaseq) && file.exists(pizzly_file) ) {
  pizzly_input <- pizzly_file
} else if ( file.exists(pizzly_file_filtered) ) {
  pizzly_input <- pizzly_file_filtered
}

if ( !is.null(pizzly_input) ) {
  ref_genes.list[["pizzly"]] <- read.table(file = pizzly_input, header = TRUE, quote = "")
  runPizzlyChunk <- TRUE
  runFusionChunk <- TRUE
}

rm(pizzly_dir, pizzly_input)

##### Read in genomic data (small variants, copy-number and structural variants) for the investigated sample
##### These are taken from the umccrise output folder, when one was provided
if ( is.null(params$umccrise) ) {

  runPcgrChunk <- FALSE
  runPurpleChunk <- FALSE
  runSVsChunk <- FALSE

} else {

  ##### The last element of the umccrise path is used as the prefix of the umccrise output files
  umccrise <- tail(unlist(strsplit(params$umccrise, split='/', fixed=TRUE)), n = 1)

  ##### PCGR (mutation) output: candidate locations, in order of preference. The first existing file is used
  pcgr_tiers_tsv <- paste0(umccrise, "-somatic.pcgr.snvs_indels.tiers.tsv")
  pcgr_candidates <- c(
    paste(params$umccrise, "small_variants", pcgr_tiers_tsv, sep = "/"),
    paste(params$umccrise, "..", "work", umccrise, "pcgr", pcgr_tiers_tsv, sep = "/"),
    paste(params$umccrise, "pcgr", pcgr_tiers_tsv, sep = "/"),
    paste(params$umccrise, "pcgr", paste0(umccrise, "-somatic.pcgr_acmg.grch37.snvs_indels.tiers.tsv"), sep = "/")
  )
  pcgr_found <- pcgr_candidates[ file.exists(pcgr_candidates) ]
  runPcgrChunk <- length(pcgr_found) > 0

  if ( runPcgrChunk ) {
    pcgr_file <- pcgr_found[1]
    pcgr_calls <- read.table(pcgr_file, sep="\t", as.is=TRUE, header=TRUE, row.names=NULL, fill = TRUE, quote = "")

    ##### Simplify the variants types: drop the "_variant" suffix and use spaces rather than underscores
    pcgr_calls$CONSEQUENCE <- gsub("_", " ", gsub("_variant", "", pcgr_calls$CONSEQUENCE))

    ##### Simplify tiers' annotations and AFs
    pcgr_calls$TIER <- gsub("TIER ", "", pcgr_calls$TIER)
    pcgr_calls$AF_TUMOR <- round(pcgr_calls$AF_TUMOR, digits = 2)

    ref_genes.list[["pcgr"]] <- pcgr_calls
    rm(pcgr_calls)
  } else {
    ref_genes.list[["pcgr"]] <- NULL
  }

  ##### PURPLE (CN) output: two file naming schemes are supported
  purple_file_1 <- paste(params$umccrise, "purple", paste0(umccrise, ".purple.gene.cnv"), sep = "/")
  purple_file_2 <- paste(params$umccrise, "purple", paste0(umccrise, ".purple.cnv.gene.tsv"), sep = "/")
  purple_uses_file_1 <- file.exists(purple_file_1)
  runPurpleChunk <- purple_uses_file_1 || file.exists(purple_file_2)

  if ( runPurpleChunk ) {
    ref_genes.list[["purple"]] <- read.table(if ( purple_uses_file_1 ) purple_file_1 else purple_file_2, sep="\t", as.is=TRUE, header=TRUE, row.names=NULL, fill = TRUE, quote = "")

    ##### Column names of the second file format are passed through CapStr
    if ( !purple_uses_file_1 ) {
      colnames(ref_genes.list[["purple"]]) <- sapply(colnames(ref_genes.list[["purple"]]), CapStr)
    }
  } else {
    ref_genes.list[["purple"]] <- NULL
  }

  ##### Manta (structural variants (SVs)) output: two file naming schemes are supported
  sv_file_1 <- paste(params$umccrise, "structural", paste0(umccrise, "-sv-prioritize-manta-pass.tsv"), sep = "/")
  sv_file_2 <- paste(params$umccrise, "structural", paste0(umccrise, "-manta.tsv"), sep = "/")
  runSVsChunk <- TRUE

  if ( file.exists(sv_file_1) ) {

    ref_genes.list[["manta"]] <- sv_prioritize_short(sv_file_1)

  } else if ( file.exists(sv_file_2) ) {

    manta_cols <- c("Tier", "Event", "Gene", "Effect", "Detail", "Location", "AF", "CN chg", "SR", "PR", "CN", "Ploidy", "Transcript", "Other effects")
    manta_calls <- sv_prioritize(sv_file_2)
    manta_calls <- manta_calls[, manta_cols]

    if ( is.null(manta_calls) ) {
      ##### No SVs: create empty dataframe with the expected columns
      manta_calls <- data.frame(matrix(ncol = length(manta_cols), nrow = 0))
      colnames(manta_calls) <- manta_cols
    } else {
      ##### Omit SVs without assigned gene
      manta_calls <- manta_calls[ manta_calls$Gene != "",  ]
    }

    ref_genes.list[["manta"]] <- manta_calls
    rm(manta_cols, manta_calls)

  } else {
    ref_genes.list[["manta"]] <- NULL
    runSVsChunk <- FALSE
  }

  ##### Extract subject ID (part of the umccrise output folder name, preceding "__") and add it to the MySQL insert command. This will overwrite argument passed to "--clinical_id" flag
  subjectID <- unlist(strsplit(umccrise, split='__', fixed=TRUE))[1]

  rm(pcgr_tiers_tsv, pcgr_candidates, pcgr_found, purple_uses_file_1)
}

##### Temporary helper: read a tab-delimited annotation table from the reference data directory.
##### Extra arguments (e.g. fill) are passed on to read.table
.read_annot_table <- function(file_name, header = TRUE, ...) {
  read.table(
    paste(params$ref_data_dir, file_name, sep = "/"),
    sep = "\t", as.is = TRUE, header = header, row.names = NULL, quote = "", ...
  )
}

##### OncoKB (http://oncokb.org) annotations
caner_genes_annot.list[["oncokb_clin_vars"]] <- .read_annot_table(params$oncokb_clin_vars)
caner_genes_annot.list[["oncokb_all_vars"]] <- .read_annot_table(params$oncokb_all_vars, fill = TRUE)

##### CIViC (https://civicdb.org/) annotations
caner_genes_annot.list[["civic_var_summaries"]] <- .read_annot_table(params$civic_var_summaries, fill = TRUE)
caner_genes_annot.list[["civic_clin_evid"]] <- .read_annot_table(params$civic_clin_evid, fill = TRUE)

##### Cancer Biomarkers database (https://www.cancergenomeinterpreter.org/biomarkers) annotations. This is mainly used to annotate reported fusion events
caner_genes_annot.list[["cancer_biomarkers_trans"]] <- .read_annot_table(params$cancer_biomarkers_trans, fill = TRUE)

##### FusionGDB database (https://ccsm.uth.edu/FusionGDB/) used to annotate reported fusion events, with info about head and tail genes.
##### The file is read without a header, so the column names are set explicitly
caner_genes_annot.list[["FusionGDB"]] <- .read_annot_table(params$FusionGDB, header = FALSE, fill = TRUE)
names(caner_genes_annot.list[["FusionGDB"]]) <- c("Hgene", "HgeneID", "Tgene", "TgeneID", "FGname", "FGID")

rm(.read_annot_table)


##### Add reference cohort name to the sample name
if ( params$dataset_name_incl != "" ) {
  sample_name <- paste0(params$sample_name, "_", params$dataset)
} else {
  sample_name <- params$sample_name
}

##### The assignment below replaces the whole ref_dataset.list[[dataset]] element. Keep hold of the clinical
##### information that may already have been stored there, so that it can be put back afterwards
clinical_info_loaded <- ref_dataset.list[[dataset]][["clinical_info"]]

##### Read in reference datasets and merge them with sample data. This part outputs a vector with first element containing the merged data and second element containing merged targets info
ref_dataset.list[[dataset]] <- combineDatasets(sample_name=sample_name, sample_counts=counts, ref_data=ref_dataset, report_dir = results_dir, dataset = dataset)
names(ref_dataset.list[[dataset]]) <- c("combined_data", "sample_annot")

##### Restore the clinical information (if any was loaded)
if ( !is.null(clinical_info_loaded) ) {
  ref_dataset.list[[dataset]][["clinical_info"]] <- clinical_info_loaded
}
rm(clinical_info_loaded)

##### Define internal, external and additional cancer group names based on the targets definition
int_cancer_group <- ref_dataset$int_ref[3]
ext_cancer_group <- ref_dataset$ext_ref[3]

##### An additional cancer group is reported only when more than three distinct targets are present.
##### It is taken as the second distinct value in the "Target" column
target_groups <- unique(ref_dataset.list[[dataset]][["sample_annot"]]$Target)

if ( length(target_groups) > 3 ) {
  add_cancer_group <- target_groups[2]
} else {
  add_cancer_group <- NULL
}
rm(target_groups)

##### Define the cancer group to be used to compare per-gene expression values and report in the summary tables.
##### The internal reference cohort is used for the pancreatic datasets listed below, the external one otherwise
if ( dataset %in% c("PAAD", "PAAD-IPMN", "PAAD-NET", "PAAD-ACC") ) {
  comp_cancer_group <- int_cancer_group
} else {
  comp_cancer_group <- ext_cancer_group
}

##### Clean the space
rm(counts, tx2ensembl)
##### Build the MySQL commands used to populate the RNA-seq data portal
##### Name of the report file referenced in both commands
mysql_report_file <- paste0(sample_name, ".RNAseq_report.html")

##### INSERT part of the command. It is left open after the "PMID" value, i.e. the remaining values are not added here
mysql_populate <- paste0(
  "### MySQL command to insert data for sample \"", sample_name, "\"\nuse piedb;\nINSERT INTO RNAseq_reports ( ID ,Platform, PatientID, SampleID, Cancer, Source, Project, Report, PMID, Analysis, Summary, Date ) VALUES ( 1000000, \"RNA_seq\"",
  ", \"", subjectID,
  "\", \"", sample_name,
  "\", \"", params$dataset,
  "\", \"", params$sample_source,
  "\", \"", params$project,
  "\", \"", mysql_report_file,
  "\", \"", sample_name,
  "\", \""
)

##### UPDATE part of the command (used when the record already exists). It is left open at the "Analysis" field
mysql_populate_update <- paste0(
  "ON DUPLICATE KEY UPDATE ID=1000000 ,Platform=\"RNA_seq\"",
  ", PatientID=\"", subjectID,
  "\", SampleID=\"", sample_name,
  "\", Cancer=\"", params$dataset,
  "\", Source=\"", params$sample_source,
  "\", Project=\"", params$project,
  "\", Report=\"", mysql_report_file,
  "\", PMID=\"", sample_name,
  "\", Analysis=\""
)

rm(mysql_report_file)
##### Prepare data for the treatment timeline plot
##### Clinical information table for the whole cohort (NULL if no spreadsheet was loaded)
clinical_info_all <- ref_dataset.list[[dataset]][["clinical_info"]]

##### Pick the identifier used to find the investigated patient: clinical ID first, then subject ID
##### from the parameters, then the subject ID stored in "subjectID"
if ( !is.na(params$clinical_id) ) {
  clinical_lookup_id <- params$clinical_id
} else if ( !is.na(params$subject_id) ) {
  clinical_lookup_id <- params$subject_id
} else if ( !is.na(subjectID) ) {
  clinical_lookup_id <- subjectID
} else {
  clinical_lookup_id <- NULL
}

##### Search for column(s) with clinical info for investigated patient. The result is always defined,
##### i.e. it is empty when no identifier is available or nothing matches
sampleID.col <- integer(0)

if ( !is.null(clinical_lookup_id) ) {
  sampleID.col <- grep(clinical_lookup_id, clinical_info_all)
}

runClinicalChunk <- FALSE

if ( length(sampleID.col) > 0 ) {

  ##### Identify row(s) with patient's details. Each matching column is searched separately, so that
  ##### the indices are row numbers also when the identifier is present in more than one column
  sampleID.row <- sort(unique(unlist(lapply(sampleID.col, function(col_idx) {
    grep(clinical_lookup_id, clinical_info_all[, col_idx])
  }))))

  clinical_info <- clinical_info_all[ sampleID.row, ]

  ##### Prepare data frame structure for plotting
  ##### Define treatment types (column names as produced by make.names) and their short labels
  treamtent.types <- make.names(c("NEOADJUVANT REGIMEN", "ADJUVANT REGIMEN", "FIRST LINE REGIMEN", "SECOND LINE REGIMEN", "THIRD LINE REGIMEN"))
  treamtent.types_simple <- c("Neoadjuvant", "Adjuvant", "1st line", "2nd line", "3rd line")
  treamtent.df <- data.frame(matrix(ncol = 4, nrow = 0))
  colnames(treamtent.df) <- c("Treatment", "Type", "Start", "End")

  ##### Current date is used as the end date if treatment is still ongoing
  today <- as.character(Sys.Date())

  for ( i in seq_along(treamtent.types) ) {

    ##### Identify treatment column number
    treamtent.types.col <- grep(paste0("^",treamtent.types[i], "$"), names(clinical_info))

    ##### Skip treatment types with no column in the spreadsheet (empty columns are dropped when the spreadsheet is read)
    if ( length(treamtent.types.col) == 0 ) {
      next
    }

    ##### Check how many treatments of particular type were used (comma-separated list)
    treamtent.types.details <- unlist(strsplit(as.character(clinical_info[, treamtent.types.col]), split=',', fixed=TRUE))

    ##### Add start and end info for each treatment. The two columns following the treatment column
    ##### are taken as the start and end dates
    if ( any(!is.na(treamtent.types.details)) ) {
      for ( treatment in treamtent.types.details ) {

        treamtent.start <- clinical_info[, treamtent.types.col+1]
        treamtent.end <- clinical_info[, treamtent.types.col+2]

        treamtent.end[ is.na(treamtent.end) ] <- today
        treamtent.tmp <- data.frame( treatment, treatment, treamtent.types_simple[i], treamtent.start, treamtent.end)
        treamtent.df <- rbind( treamtent.df, treamtent.tmp)
      }
    }
  }

  if ( nrow(treamtent.df) > 0 ) {
    ##### For security reasons (wrt plots that go to PIEdb), change the dates but preserve the duration of the treatments
    ##### Get the earliest treatment date and set it as day 0. Then, create fake start and end dates based on the treatment length
    day0 <- sort(treamtent.df$treamtent.start, decreasing = FALSE)[1]
    treamtents.length <- treamtent.df$treamtent.end - treamtent.df$treamtent.start
    treamtents.reset <- as.Date("2000-01-01") - day0
    treamtent.df$treamtent.start <- treamtent.df$treamtent.start + treamtents.reset
    treamtent.df$treamtent.end <- treamtent.df$treamtent.start + treamtents.length
    names(treamtent.df) <- c("Treatment", "Drug", "Type", "Start",  "End")

    ##### Create directory for timeline plot
    PlotsDir <- paste(results_dir, "clinical_info", sep = "/")
    if ( !file.exists(PlotsDir) ) {
      dir.create(PlotsDir, recursive=TRUE)
    }

    ##### Record the timeline plot. NOTE, the modified dates are used here
    treatment_timeline <- lares::plot_timeline(event = treamtent.df$Treatment, start = treamtent.df$Start, end = treamtent.df$End, label = NA, group = treamtent.df$Type, title = "", subtitle = "", save = FALSE)

    ##### Save the plot into png file. NOTE, the modified dates are used here. As default, the plot is saved as "cv_timeline"
    lares::plot_timeline(event = treamtent.df$Treatment, start = treamtent.df$Start, end = treamtent.df$End, label = NA, group = treamtent.df$Type, title = "", subtitle = "", save = TRUE, subdir = "clinical_info")

    #### Clear plots to free up some memory
    if(!is.null(dev.list())) invisible(dev.off())

    cv_timeline.png <- readPNG("clinical_info/cv_timeline.png", native = FALSE, info = FALSE)

    ##### Re-write the timeline png plot as "treatment_timeline.png" in the results folder
    png::writePNG(cv_timeline.png, paste(PlotsDir, "treatment_timeline.png", sep="/"), dpi=300)
    #png(paste(PlotsDir, "treatment_timeline.png", sep="/"), width = 900, height = 600, pointsize = 0.0001, res=300)
    #plot(cv_timeline.png)
    #invisible(dev.off())

    #### Clear plots to free up some memory
    if(!is.null(dev.list())) invisible(dev.off())

    ##### Remove the original plot folder (created in the working directory)
    unlink("clinical_info", recursive = TRUE, force = TRUE)

    runClinicalChunk <- TRUE
  }

  ##### Clean the space. Only objects that exist are removed ("cv_timeline.png" is created only when a plot was made)
  rm(list = ls(pattern='^treamtent.*'))
  rm(list = intersect(c("clinical_info", "cv_timeline.png"), ls()))
}

rm(clinical_info_all, clinical_lookup_id)
##### Combine UMCCR cancer gene list (https://github.com/vladsaveliev/NGS_Utils/blob/master/ngs_utils/reference_data/key_genes/umccr_cancer_genes.2019-03-20.tsv) with OncoKB cancer genes
##### The OncoKB table is used as the starting point and is extended with five columns:
##### "UMCCR" (initialised to "No") and "Oncogene", "TSG", "Fusion", "Germline" (initialised to "-")
genes_cancer <- ref_genes.list[["genes_oncokb"]]
genes_cancer$UMCCR <- rep("No", nrow(genes_cancer))
genes_cancer$Oncogene <- rep("-", nrow(genes_cancer))
genes_cancer$TSG <- rep("-", nrow(genes_cancer))
genes_cancer$Fusion <- rep("-", nrow(genes_cancer))
genes_cancer$Germline <- rep("-", nrow(genes_cancer))

##### Flag Oncogenes, TSGs and fusion genes in the UMCCR cancer genes list (https://github.com/vladsaveliev/NGS_Utils/blob/master/ngs_utils/reference_data/key_genes/umccr_cancer_genes.2019-03-20.tsv)
##### The flags are recoded as text: "TRUE" becomes "Yes" and "FALSE" becomes "-"
ref_genes.list[["genes_cancer"]]$germ <- gsub("TRUE", "Yes", ref_genes.list[["genes_cancer"]]$germ)
ref_genes.list[["genes_cancer"]]$germ <- gsub("FALSE", "-", ref_genes.list[["genes_cancer"]]$germ)
ref_genes.list[["genes_cancer"]]$fusion <- gsub("TRUE", "Yes", ref_genes.list[["genes_cancer"]]$fusion)
ref_genes.list[["genes_cancer"]]$fusion <- gsub("FALSE", "-", ref_genes.list[["genes_cancer"]]$fusion)
ref_genes.list[["genes_cancer"]]$tumorsuppressor <- gsub("TRUE", "Yes", ref_genes.list[["genes_cancer"]]$tumorsuppressor)
ref_genes.list[["genes_cancer"]]$tumorsuppressor <- gsub("FALSE", "-", ref_genes.list[["genes_cancer"]]$tumorsuppressor)
ref_genes.list[["genes_cancer"]]$oncogene <- gsub("TRUE", "Yes", ref_genes.list[["genes_cancer"]]$oncogene)
ref_genes.list[["genes_cancer"]]$oncogene <- gsub("FALSE", "-", ref_genes.list[["genes_cancer"]]$oncogene)

##### Go through the UMCCR genes one by one and either update the matching OncoKB row or append a new row
for ( gene in unlist(ref_genes.list[["genes_cancer"]]$symbol ) ) {
  ##### Check if the UMCCR genes is already reported in OncoKB
  if ( gene %in% genes_cancer$Hugo.Symbol ) {

    ##### Mark the gene as present in the UMCCR list and copy over its UMCCR flags
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, ]$UMCCR <- "Yes"
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, ]$Oncogene <- ref_genes.list[["genes_cancer"]]$oncogene[ref_genes.list[[ "genes_cancer"]]$symbol==gene]
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, ]$TSG <- ref_genes.list[["genes_cancer"]]$tumorsuppressor[ref_genes.list[[ "genes_cancer"]]$symbol==gene]
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, ]$Fusion <- ref_genes.list[["genes_cancer"]]$fusion[ref_genes.list[[ "genes_cancer"]]$symbol==gene]
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, ]$Germline <- ref_genes.list[["genes_cancer"]]$germ[ref_genes.list[[ "genes_cancer"]]$symbol==gene]

    ##### Increase the value in the second column (named "Gene panels no." below) by one, to account for the UMCCR list
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, 2] <- as.numeric(genes_cancer[ genes_cancer$Hugo.Symbol==gene, 2]) + 1

  ##### Add if not present
  } else {
    ##### NOTE(review): the appended vector has 12 values, while the table appears to have 16 columns at this
    ##### point (see the 16 names assigned below). rbind presumably recycles the values to fill the last four
    ##### columns, which are then overwritten by the four assignments that follow - confirm
    genes_cancer <- rbind(genes_cancer, c(gene, 1, "No", rep("", 8), "Yes"))
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, ]$Oncogene <- ref_genes.list[["genes_cancer"]]$oncogene[ref_genes.list[[ "genes_cancer"]]$symbol==gene]
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, ]$TSG <- ref_genes.list[["genes_cancer"]]$tumorsuppressor[ref_genes.list[[ "genes_cancer"]]$symbol==gene]
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, ]$Fusion <- ref_genes.list[["genes_cancer"]]$fusion[ref_genes.list[[ "genes_cancer"]]$symbol==gene]
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, ]$Germline <- ref_genes.list[["genes_cancer"]]$germ[ref_genes.list[[ "genes_cancer"]]$symbol==gene]
  }
}

##### Make the data frame look nicer: use gene symbols as row names, assign readable column names,
##### keep and re-order the columns to report, and show "No" and empty values as "-"
rownames(genes_cancer) <- genes_cancer$Hugo.Symbol
names(genes_cancer) <- c("Gene", "Gene panels no.", "OncoKB", "Oncogene (OncoKB)", "TSG (OncoKB)", "MSK-IMPACT", "MSK-HEME", "Foundation One", "Foundation One Heme", "Vogelstein", "Sanger CGC", "UMCCR", "Oncogene", "TSG", "Fusion", "Germline")
genes_cancer <- genes_cancer[,c("Oncogene", "TSG", "Fusion", "Germline", "Gene panels no.", "UMCCR", "OncoKB", "MSK-IMPACT", "MSK-HEME", "Foundation One", "Foundation One Heme", "Vogelstein", "Sanger CGC")]
genes_cancer[ genes_cancer=="No" ] <- "-"
genes_cancer[ genes_cancer=="" ] <- "-"

##### Store the combined table, and replace the OncoKB table with the rows of the combined table
##### whose gene symbols were present in the original OncoKB list
ref_genes.list[["genes_cancer"]] <- genes_cancer
ref_genes.list[["genes_oncokb"]] <- genes_cancer[ rownames(genes_cancer) %in% ref_genes.list[["genes_oncokb"]]$Hugo.Symbol, ]

##### Clean the space
rm(genes_cancer)
##### Record all genes of interest to make sure that these are not filtered out during read counts data processing
# Mutated genes in given patient based on PCGR report, including only those with variants classified according to user-defined tier
if ( runPcgrChunk ) {

  pcgr_calls <- ref_genes.list[["pcgr"]]

  ##### Genes with variants assigned to tier 1 up to the user-defined tier
  mutated_genes <- unique(pcgr_calls[ pcgr_calls$TIER %in% c(1:params$pcgr_tier), ]$SYMBOL)

  ##### Optionally include splice region variants reported in the "NONCODING" tier
  if ( params$pcgr_splice_vars ) {
    tier_consequence <- paste0(pcgr_calls$TIER, ".", pcgr_calls$CONSEQUENCE)
    splice_region_hits <- grepl("NONCODING.*splice region", tier_consequence, fixed = FALSE)
    mutated_genes <- unique( c(mutated_genes, pcgr_calls[ splice_region_hits, ]$SYMBOL) )
    rm(tier_consequence, splice_region_hits)
  }

  ##### Remove NAs. Nothing is recorded if no gene was selected
  if ( length(mutated_genes) > 0 ) {
    ref_genes.list[["summary"]]$Mutated <- mutated_genes[ !(is.na(mutated_genes)) ]
  } else {
    ref_genes.list[["summary"]]$Mutated <- NULL
  }

  rm(pcgr_calls, mutated_genes)
}
    
# Genes involved in fusion events detected in given patient based on ARRIBA, PIZZLY and DRAGEN results
if ( runFusionChunk ) {

  ##### Collect gene pairs from each fusion caller for which results are available
  fusion_genes <- NULL

  if ( runArribaChunk ) {
    fusion_genes <- c(as.character(ref_genes.list[["arriba"]]$X.gene1), as.character(ref_genes.list[["arriba"]]$gene2))
  }

  if ( runPizzlyChunk ) {
    fusion_genes <- c(fusion_genes, as.character(ref_genes.list[["pizzly"]]$geneA.name), as.character(ref_genes.list[["pizzly"]]$geneB.name))
  }

  if ( runDragenFusionChunk ) {
    fusion_genes <- c(fusion_genes, as.character(ref_genes.list[["dragenFusion"]]$gene1), as.character(ref_genes.list[["dragenFusion"]]$gene2))
  }

  fusion_genes <- unique(fusion_genes)

  ##### Remove NAs. The check is made on the fusion genes themselves (not on the mutated genes),
  ##### so that fusion genes are kept also when no mutated genes were recorded
  if ( length(fusion_genes) > 0 ) {
    ref_genes.list[["summary"]]$Fusion <- fusion_genes[ !(is.na(fusion_genes)) ]
  } else {
    ref_genes.list[["summary"]]$Fusion <- NULL
  }

  rm(fusion_genes)
}

# Genes affected by structural variants (SVs) in given patient based on MANTA results
if ( runSVsChunk ) {

  ##### Take the gene field of SVs that have a gene assigned
  sv_calls <- ref_genes.list[["manta"]]
  sv_genes <- sv_calls[ sv_calls$Gene != "",  ]$Gene
  # ...and distinguish classified by MANTA as fusion genes

  if ( length(sv_genes) > 0 ) {
    ##### Several genes may be reported for one SV, separated by "&". Split them and remove NAs
    sv_genes <- unique(unlist(strsplit(sv_genes, split='&', fixed=TRUE)))
    ref_genes.list[["summary"]]$SV <- sv_genes[ !(is.na(sv_genes)) ]
  } else {
    ref_genes.list[["summary"]]$SV <- NULL
  }

  rm(sv_calls, sv_genes)
}

# PURPLE annotation of copy-number (CN) altered genes in given patient based on PURPLE results, including only those with CN values meeting user-defined thresholds
if ( runPurpleChunk ) {

  ##### Drop records without gene name. The gene column is tested here (comparing the whole
  ##### data frame with "" does not filter any rows)
  cn_calls <- ref_genes.list[["purple"]]
  cn_calls <- cn_calls[ !(cn_calls$Gene %in% ""),  ]

  ##### Get the CN mean (of the minimum and maximum copy number)
  cn_calls$MeanCopyNumber <- rowMeans(cbind(cn_calls$MinCopyNumber, cn_calls$MaxCopyNumber))

  ##### Deal with negative CN values
  cn_calls$MeanCopyNumber[ cn_calls$MeanCopyNumber < 0 ] <- 0

  ##### Limit the data to include only cancer genes
  cn_calls <- cn_calls[ cn_calls$Gene %in% rownames(ref_genes.list[["genes_cancer"]]), ]

  ##### Keep only altered genes with CN values below loss threshold (default 5th percentile) and above gain threshold (default 95th percentile)
  ##### With the default settings the thresholds are the 5th and 95th percentile of the mean CN values.
  ##### Otherwise the user-defined values are used directly as thresholds
  if ( params$cn_loss == 5 && params$cn_gain == 95 ) {
    cn_data.all.percent <- quantile(cn_calls$MeanCopyNumber, probs = seq(0, 1, .05), na.rm = TRUE)
    cn_bottom <- round(cn_data.all.percent[2], digits = 2)
    cn_top <- round(cn_data.all.percent[20], digits = 2)

  } else {
    cn_bottom <- params$cn_loss
    cn_top <- params$cn_gain
  }

  ##### If the difference is 0 then increase/decrease threshold by 1. The thresholds are NA when there are
  ##### no CN values to compute the percentiles from, in which case no adjustment is made (and no gene is selected)
  if  ( isTRUE(abs(cn_top-cn_bottom) == 0) ) {
    cn_top <- cn_top + 1
    cn_bottom <- cn_bottom - 1
  }

  cn_genes <- unique(cn_calls[ cn_calls$MeanCopyNumber <= cn_bottom | cn_calls$MeanCopyNumber >= cn_top, ]$Gene)

  ##### Remove NAs
  if ( length(cn_genes) > 0 ) {
    ref_genes.list[["summary"]]$CN <- cn_genes[ !(is.na(cn_genes)) ]
  } else {
    ref_genes.list[["summary"]]$CN <- NULL
  }

  rm(cn_calls, cn_genes)
}

# Immune response markers
immune_genes <- unique(ref_genes.list[["genes_immune"]]$immune_markers$SYMBOL)

##### Add the immunogram genes (when requested) and remove NAs
if ( params$immunogram ) {
  immune_genes <- unique(c(immune_genes, ref_genes.list[["genes_immune"]]$immunogram$SYMBOL))
  immune_genes <- immune_genes[ !(is.na(immune_genes)) ]
}

ref_genes.list[["summary"]]$Immune <- immune_genes

# HRD (homologous recombination deficiency) genes, with NAs removed
hrd_genes <- unique(ref_genes.list[["genes_hrd"]]$SYMBOL)
ref_genes.list[["summary"]]$HRD <- hrd_genes[ !(is.na(hrd_genes)) ]

# Cancer genes derived from UMCCR Cancer Gene list (https://github.com/vladsaveliev/NGS_Utils/blob/master/ngs_utils/reference_data/key_genes/umccr_cancer_genes.2019-03-20.tsv) and from OncoKB portal (http://oncokb.org/#/cancerGenes), with NAs removed
cancer_genes <- rownames(ref_genes.list[["genes_cancer"]])
ref_genes.list[["summary"]]$Cancer <- cancer_genes[ !(is.na(cancer_genes)) ]

rm(immune_genes, hrd_genes, cancer_genes)

##### Record all genes of interest, i.e. the union of all gene sets in the summary list
genes2keep <- unique( unlist(ref_genes.list[["summary"]]) )
##### Get gene symbols for the genes of interest. These genes will not be filtered out due to low/insufficient expression
##### Get the Ensembl annotation database object for the requested Ensembl version. The object is
##### looked up by name (the corresponding EnsDb package is expected to be loaded)
edb <- get(paste0("EnsDb.Hsapiens.v", params$ensembl_version))

##### Get all gene IDs available in the annotation database
keys <- keys(edb, keytype="GENEID")

##### Get gene IDs along with gene names, and rename the columns to "ENSEMBL" and "SYMBOL"
gene_info <- ensembldb::select(edb, keys=keys, columns=c("GENEID", "GENENAME"), keytype="GENEID")
names(gene_info) <- gsub("GENEID", "ENSEMBL", names(gene_info))
names(gene_info) <- gsub("GENENAME", "SYMBOL", names(gene_info))

##### Limit genes annotation to the genes of interest
genes2keep <- gene_info[ gene_info$SYMBOL %in% genes2keep,  ]

##### Remove rows with duplicated ENSEMBL IDs
genes2keep <- genes2keep[!duplicated(genes2keep$ENSEMBL),]
rownames(genes2keep) <- genes2keep$ENSEMBL

##### Remove rows with duplicated gene symbols (Y_RNAs, SNORs, LINC0s etc). Preferably select ENSEMBL ID that is used in the count data
##### 1. Keep all records with ENSEMBL ID present in the count data
##### 2. For the remaining gene symbols keep the first record only
genes2keep.combined_data <- genes2keep[ genes2keep$ENSEMBL %in% rownames(ref_dataset.list[[dataset]]$combined_data), ]
genes2keep <- genes2keep[ genes2keep$SYMBOL %!in% genes2keep.combined_data$SYMBOL, ]
genes2keep <-  genes2keep[!duplicated(genes2keep$SYMBOL),]
genes2keep <- rbind(genes2keep.combined_data, genes2keep)

##### Add column to store info about filtered genes
genes2keep$EXP <- TRUE

##### Clean the space
rm(edb, keys, gene_info)
suppressMessages(library(plotly))
##### Generate bar-plot for library size. The colours indicate sample groups, as provided in *Target* column in the sample annotation file

data <- ref_dataset.list[[dataset]][["combined_data"]]
target <- ref_dataset.list[[dataset]][["sample_annot"]]

##### Label the investigated sample as "Patient", both in the target column and in the row names
target$Target[ target$Target==sample_name ] <- "Patient"
rownames(target)[ rownames(target)==sample_name ] <- "Patient"

##### Keep the target groups in order of their appearance
target$Target <- factor(target$Target, levels = unique(target$Target))

##### Assign colours to targets and datasets
targets.colour <- getColours(target$Target)

##### Prepare data frame with library size (column sums of the count data) expressed in millions
data.df <- data.frame(
  Sample = rownames(target),
  Library_size = as.numeric(colSums(data)*1e-6),
  Target = target$Target
)

##### The default order will be alphabetized unless specified as below
data.df$Sample <- factor(data.df$Sample, levels = data.df[["Sample"]])

library_size <- plot_ly(
  data.df,
  x = ~Sample, y = ~Library_size, color = ~Target,
  colors = targets.colour[[1]],
  type = 'bar', width = 800, height = 400
) %>%
  layout(
    title = "",
    xaxis = list( tickfont = list(size = 10), title = "", showticklabels = FALSE),
    yaxis = list(title = "Library size (millions)"),
    margin = list(l=50, r=50, b=50, t=50, pad=4),
    autosize = FALSE,
    showlegend = TRUE,
    legend = list(orientation = 'h', y = max(data.df$Library_size), bgcolor = "white")
  )

##### Create directory for input data plots
PlotsDir <- paste(results_dir, "InputDataPlots", sep = "/")
if ( !file.exists(PlotsDir) ) {
  dir.create(PlotsDir, recursive = TRUE)
}

##### Save interactive plot as html file
saveWidgetFix(library_size, file = paste(PlotsDir, "library_size.html", sep = "/"))

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload = FALSE)
##### Filtering to remove low expressed genes. For differential expression and related analyses, gene expression is rarely considered at the level of raw counts since libraries sequenced at a greater depth will result in higher counts. Rather, it is common practice to transform raw counts onto a scale that accounts for such library size differences. Genes with very low counts across all libraries provide little evidence for differential expression. From the biological point of view, a gene must be expressed at some minimal level before it is likely to be translated into a protein or to be biologically important. In addition, the pronounced discreteness of these counts interferes with some of the statistical approximations that are used later in the pipeline. These genes should be filtered out prior to further analysis. Users should filter with CPM rather than filtering on the counts directly, as the latter does not account for differences in library sizes between samples. For instance, for the CPM-transformed data we keep only genes that have CPM above 1 in at least 10% of samples

##### Transformation to CPM or TPM scale (see these blogs for details https://www.rna-seqblog.com/rpkm-fpkm-and-tpm-clearly-explained/ and https://haroldpimentel.wordpress.com/2014/05/08/what-the-fpkm-a-review-rna-seq-expression-units/ ).  CPM = Counts Per Million,  TPM = Transcripts Per Kilobase Million.

##### For counts data processing consider the investigated sample and internal reference cohort as one group (regardless of the investigated patient tissue origin), and TCGA data (of any cancer type) as another group. This is to facilitate the batch-effects (related to technical aspects) correction process
target_mod <- ref_dataset.list[[dataset]][["sample_annot"]]
target_mod$Dataset <- gsub(sample_name, int_cancer_group, target_mod$Dataset)
targets_mod.list <- unique(target_mod$Dataset)

##### Create list with processed data for each group. The per-group results are combined under the "comb" element below
y <- vector("list", length(targets_mod.list))
names(y) <- targets_mod.list

##### Keep info about the smallest and the largest library size (in millions of reads, rounded)
cpm.min <- round(min(as.numeric(colSums(ref_dataset.list[[dataset]][["combined_data"]])*1e-6)), digits=0)
cpm.max <- round(max(as.numeric(colSums(ref_dataset.list[[dataset]][["combined_data"]])*1e-6)), digits=0)

#### For each group...
for ( group in targets_mod.list ) {

  ##### Samples belonging to the current group. "drop = FALSE" keeps the table shape also for a group with a single sample
  in_group <- target_mod$Dataset == group
  target <- target_mod[ in_group, ]
  data <- ref_dataset.list[[dataset]][["combined_data"]][ , in_group, drop = FALSE]

  ##### CPM transformation (with or without filtering)
  if ( params$transform == "CPM" ) {

    ##### Create EdgeR DGEList object
    y[[group]] <- edgeR::DGEList(counts=data,  group=target$Dataset)

    if ( params$filter ) {

      ##### Keep genes with CPM above 1 in at least 10% of samples
      filter.threshold <- 1
      keep <- rowSums(edgeR::cpm(y[[group]])>filter.threshold) >= ncol(data)/10

      ##### Note which genes of interest are not expressed, i.e. do not pass the filter in this group. "names(keep)" alone lists every gene in the data, so the genes are compared against those with keep == TRUE
      genes2keep$EXP[ rownames(genes2keep) %!in% names(which(keep)) ] <- FALSE

      ##### Keep the genes of interest too. The subset is taken before any extra element is added to the DGEList object
      keep[ names(keep) %in% rownames(genes2keep) ] <- TRUE
      y[[group]]$filtered <- y[[group]][keep, , keep.lib.sizes=FALSE]
    }

    ##### Transform the raw-scale to CPM. Add small offset to each observation to avoid taking log of zero
    y[[group]]$transformed <- edgeR::cpm(y[[group]], normalized.lib.sizes=FALSE, log=params$log, prior.count=0.25)

    if ( params$filter ) {
      y[[group]]$filtered.transformed <- edgeR::cpm(y[[group]]$filtered, normalized.lib.sizes=FALSE, log=params$log, prior.count=0.25)
    }

  ##### TPM data transformation (with or without filtering). We can convert RPKM to TPM in two different ways: from pre-calculated RPKM, by dividing by the sum of RPKM values, or directly from the normalized counts. Here we calculate TPM starting from RPKM values computed using edgeR's rpkm function ( from http://luisvalesilva.com/datasimple/rna-seq_units.html )
  } else if ( params$transform == "TPM" ) {

    ##### Get genes lengths
    edb <- get(paste0("EnsDb.Hsapiens.v", params$ensembl_version))
    gene.length <- lengthOf(edb, filter = GeneIdFilter(rownames(data)))

    ##### Check for which genes the length info is not available and remove them from the data
    genes.no_length <- rownames(data)[ rownames(data) %!in% names(gene.length)]
    data <- data[ rownames(data) %!in% genes.no_length, , drop = FALSE]

    ##### Put the gene lengths in the same order as the rows of the count data. The lengths are matched to the genes by position, so the order returned by the annotation database cannot be relied on
    gene.length <- gene.length[ rownames(data) ]

    ##### Create EdgeR DGEList object
    y[[group]] <- edgeR::DGEList(counts=data,  group=target$Dataset)

    ##### Convert data into RPKM
    y[[group]]$transformed <- edgeR::rpkm(y[[group]], gene.length = gene.length, normalized.lib.sizes=FALSE, log=FALSE)

    ##### ... and then to TPM scale. Add small offset to each observation to avoid taking log of zero
    if ( params$log ) {
      y[[group]]$transformed <- log2(tpm_from_rpkm(y[[group]]$transformed+0.25))
    } else {
      y[[group]]$transformed <- tpm_from_rpkm(y[[group]]$transformed)
    }

    if ( params$filter ) {

      ##### Keep genes with values above the threshold in at least 10% of samples
      ##### NOTE(review): for log-transformed data the threshold (1 + offset) is compared with log2-scale values - confirm that this is the intended cut-off
      filter.threshold <- if ( params$log ) 1+0.25 else 1
      keep <- rowSums(y[[group]]$transformed > filter.threshold) >= ncol(y[[group]]$transformed)/10

      ##### Note which genes of interest are not expressed, i.e. do not pass the filter in this group
      genes2keep$EXP[ rownames(genes2keep) %!in% names(which(keep)) ] <- FALSE

      ##### Keep the genes of interest too
      keep[ names(keep) %in% rownames(genes2keep) ] <- TRUE
      y[[group]]$filtered <- y[[group]]$counts[keep, , drop = FALSE]
      y[[group]]$filtered.transformed <- y[[group]]$transformed[keep, , drop = FALSE]
    }
  }
}

##### Now combine the data created for each of the two groups
y[["comb"]]$transformed <- cbind(y[[targets_mod.list[1]]]$transformed, y[[targets_mod.list[2]]]$transformed)
y[["comb"]]$samples <- rbind(y[[targets_mod.list[1]]]$samples, y[[targets_mod.list[2]]]$samples)

if ( params$filter ) {

  ##### Keep only genes present in all sets
  genes_mod <- intersect(rownames(y[[targets_mod.list[1]]]$filtered), rownames(y[[targets_mod.list[2]]]$filtered))
  y[[targets_mod.list[1]]]$filtered <- y[[targets_mod.list[1]]]$filtered[ rownames(y[[targets_mod.list[1]]]$filtered) %in% genes_mod, ]
  y[[targets_mod.list[2]]]$filtered <- y[[targets_mod.list[2]]]$filtered[ rownames(y[[targets_mod.list[2]]]$filtered) %in% genes_mod, ]
  y[[targets_mod.list[1]]]$filtered.transformed <- y[[targets_mod.list[1]]]$filtered.transformed[ rownames(y[[targets_mod.list[1]]]$filtered.transformed) %in% genes_mod, ]
  y[[targets_mod.list[2]]]$filtered.transformed <- y[[targets_mod.list[2]]]$filtered.transformed[ rownames(y[[targets_mod.list[2]]]$filtered.transformed) %in% genes_mod, ]

  y[["comb"]]$filtered <- cbind(y[[targets_mod.list[1]]]$filtered, y[[targets_mod.list[2]]]$filtered)
  y[["comb"]]$filtered.transformed <- cbind(y[[targets_mod.list[1]]]$filtered.transformed, y[[targets_mod.list[2]]]$filtered.transformed)
}

##### Clean the space. "genes_mod" and "keep" exist only when filtering is enabled, so remove only the objects that are present
rm(list = intersect(c("target", "target_mod", "genes_mod", "keep", "in_group"), ls()))
##### Assign colours to targets and datasets
target <- ref_dataset.list[[dataset]][["sample_annot"]]
targets.colour <- getColours(target$Target)

##### Legend labels. These are needed with and without filtering (and by the normalisation plots), so they are defined before branching
if ( !is.null(add_cancer_group) ) {
  legend <- c(ext_cancer_group, add_cancer_group, int_cancer_group, "Patient")
} else {
  legend <- c(ext_cancer_group, int_cancer_group, "Patient")
}

##### Collect the most extreme density values to set the x-axis and y-axis boundaries. "den.x" and "den.y" hold the (min, max) pair across all samples
den.x <- numeric(0)
den.y <- numeric(0)

for ( i in seq_len(ncol(y[["comb"]]$transformed)) ) {
  den <- density(y[["comb"]]$transformed[,i])
  den.x <- range(c(den.x, den$x))
  den.y <- range(c(den.y, den$y))
}

##### Helper drawing one panel with per-sample density curves
#####   mat        = expression matrix with samples in columns
#####   main       = panel title
#####   xlim, ylim = axes limits
#####   colours    = list with group colours (1st element, used in the legend) and per-sample colours (2nd element, used for the curves)
#####   labels     = legend labels
#####   legend.cex = legend text size
plotTransformedDensities <- function(mat, main, xlim, ylim, colours, labels, legend.cex = 1) {
  plot(density(mat[,1]), lwd=2, xlim=xlim, ylim=ylim, las=2, main="", xlab="", col=colours[[2]][1])
  title(main=main, xlab=params$transform)
  abline(v=0, lty=3)

  ##### Add the curves for the remaining samples (nothing to add for a single-sample matrix)
  for ( s in seq_len(ncol(mat))[-1] ) {
    den.s <- density(mat[,s])
    lines(den.s$x, den.s$y, lwd=2, col=colours[[2]][s])
  }
  legend("topright", legend=labels, fill=colours[[1]], cex=legend.cex, bty="n", bg = "transparent")
}

##### Plot read counts against transformed data
if ( params$filter ) {
  suppressMessages(library(plotly))

  ##### Organise the data into data frame. The last column of the data is used (labelled as the patient sample in the plot). The raw counts are matched to the transformed values by gene ID, since genes without length information are dropped from the TPM-transformed data. Log-transformed values are used as they are
  transformed.patient <- y[["comb"]]$transformed[, ncol(y[["comb"]]$transformed)]
  counts.all <- ref_dataset.list[[dataset]][["combined_data"]]
  data.df <- data.frame(Transformed = as.numeric(transformed.patient),
                        Counts = as.numeric(counts.all[ names(transformed.patient), ncol(counts.all) ]))

  ##### Keep only genes with read counts below the 99th percentile
  data.df <- data.df[ data.df$Counts < quantile(data.df$Counts, 0.99), ]

  ##### Keep only every 25th gene to reduce the size of the plot
  data.df <- data.df[ seq(1,nrow(data.df), by=25), ]

  ##### Generate plot for filtered data
  counts_vs_transformed <- plot_ly( data.df, x = ~Transformed, y = ~Counts, width = 800, height = 300, color = I('black'), marker = list(size = 5), type="scatter", mode = "markers", name = paste0(params$transform, " / Counts (Patient)") ) %>%
    add_trace(x = c(filter.threshold, filter.threshold), y= c(0, max(data.df$Counts)), mode = "lines", color = I("red"), name = "Filtering threshold") %>%
    layout(title = "", xaxis = list(title = paste0(params$transform, "s")), yaxis = list(title = "Counts"), showlegend=TRUE)

  ##### Save interactive plot as html file
  saveWidgetFix(counts_vs_transformed, file = paste(PlotsDir, "counts_vs_transformed.html", sep = "/"))

  ##### Detach plotly package. Otherwise it clashes with other graphics devices
  detach("package:plotly", unload=FALSE)

  ##### Plots for the report. The x-axis is cut at the largest transformed value retained for the scatter plot
  par(mfrow=c(1,2))

  ##### Before filtering
  plotTransformedDensities(y[["comb"]]$transformed, main="Transformed data (unfiltered)", xlim=c(den.x[1], max(data.df$Transformed)), ylim=den.y, colours=targets.colour, labels=legend)
  data_transformation_nonfiltered <- recordPlot()

  ##### After filtering
  plotTransformedDensities(y[["comb"]]$filtered.transformed, main="Transformed and filtered data", xlim=c(den.x[1], max(data.df$Transformed)), ylim=den.y, colours=targets.colour, labels=legend)
  data_transformation_filtered <- recordPlot()

  ##### Save the plot as png file (full x-axis range)
  png(paste0(PlotsDir, "/filtering.png"), width=900, height=400, pointsize = 14)
  par(mfrow=c(1,2))

  ##### Before filtering
  plotTransformedDensities(y[["comb"]]$transformed, main="Transformed data (unfiltered)", xlim=den.x, ylim=den.y, colours=targets.colour, labels=legend, legend.cex=0.7)

  ##### After filtering
  plotTransformedDensities(y[["comb"]]$filtered.transformed, main="Transformed and filtered data", xlim=den.x, ylim=den.y, colours=targets.colour, labels=legend, legend.cex=0.7)
  invisible(dev.off())

##### Without filtering
} else {
  plotTransformedDensities(y[["comb"]]$transformed, main="Transformed data (unfiltered)", xlim=den.x, ylim=den.y, colours=targets.colour, labels=legend)
  data_transformation_nonfiltered <- recordPlot()

  #### Clear plots to free up some memory
  if ( !is.null(dev.list()) ) {
    invisible(dev.off())
  }

  ##### Save the plot as png file
  png(paste0(PlotsDir, "/filtering.png"), width=900, height=400, pointsize = 14)
  plotTransformedDensities(y[["comb"]]$transformed, main="Transformed data (unfiltered)", xlim=den.x, ylim=den.y, colours=targets.colour, labels=legend, legend.cex=0.7)
  invisible(dev.off())
}

##### Clean the space (only the objects that exist are removed)
rm(list = intersect(c("data", "data.df", "target", "den.x", "den.y", "transformed.patient", "counts.all", "plotTransformedDensities"), ls()))
##### External factors that are not of biological interest can affect the expression of individual samples during the sample preparation or sequencing process. For example, samples processed in the first batch of an experiment can have higher expression overall when compared to samples processed in a second batch. It is assumed that all samples should have a similar range and distribution of expression values, hence normalisation for sample-specific effects is required to ensure that the expression distributions of each sample are similar across the entire experiment.

##### TMM normalisation. Trimmed mean of M-values (TMM, https://www.ncbi.nlm.nih.gov/pubmed/20196867) is performed using the calcNormFactors function in edgeR. The normalisation factors calculated here are used as scaling factors for the library sizes. TMM is recommended for most RNA-Seq data where the majority (more than half) of the genes are believed not to be differentially expressed between any pair of samples. It adjusts for the RNA composition effect. Note that the raw read counts are used to calculate the normalisation factors

#### Process each group separately
for ( group in targets_mod.list ) {

  ##### CPM data: library sizes are scaled with normalisation factors computed with the user-defined method
  if ( params$transform == "CPM" ) {

    if ( !params$filter ) {

      ##### Unfiltered data: the factors are stored in the sample table of the group object
      y[[group]]$samples["norm.factors"] <- edgeR::calcNormFactors(
        y[[group]],
        method = params$norm
      )$samples["norm.factors"]
      y[[group]]$noNorm <- y[[group]]$transformed
      y[[group]]$norm <- edgeR::cpm(
        y[[group]],
        normalized.lib.sizes = TRUE,
        log = params$log,
        prior.count = 0.25
      )

    } else {

      ##### Filtered data: the factors are stored in the sample table of the filtered object
      y[[group]]$filtered$samples["norm.factors"] <- edgeR::calcNormFactors(
        y[[group]]$filtered,
        method = params$norm
      )$samples["norm.factors"]
      y[[group]]$noNorm <- y[[group]]$filtered.transformed
      y[[group]]$norm <- edgeR::cpm(
        y[[group]]$filtered,
        normalized.lib.sizes = TRUE,
        log = params$log,
        prior.count = 0.25
      )
    }

  ##### TPM data: quantile normalisation (from https://www.biostars.org/p/296992/ )
  } else if ( params$transform == "TPM" ) {

    if ( !params$filter ) {

      ##### Unfiltered data. Keep a copy of the input and make sure it is a numeric matrix
      y[[group]]$noNorm <- y[[group]]$transformed
      y[[group]]$transformed <- data.matrix(y[[group]]$transformed)

      if ( tolower(params$norm) == "none" ) {
        y[[group]]$norm <- y[[group]]$transformed
      } else {
        y[[group]]$norm <- normalize.quantiles(y[[group]]$transformed, copy = TRUE)

        ##### Restore the sample and gene names
        dimnames(y[[group]]$norm) <- list(
          rownames(y[[group]]$transformed),
          colnames(y[[group]]$transformed)
        )
      }

    } else {

      ##### Filtered data. Keep a copy of the input and make sure it is a numeric matrix
      y[[group]]$noNorm <- y[[group]]$filtered.transformed
      y[[group]]$filtered.transformed <- data.matrix(y[[group]]$filtered.transformed)

      if ( tolower(params$norm) == "none" ) {
        y[[group]]$norm <- y[[group]]$filtered.transformed
      } else {
        y[[group]]$norm <- normalize.quantiles(y[[group]]$filtered.transformed, copy = TRUE)

        ##### Restore the sample and gene names
        dimnames(y[[group]]$norm) <- list(
          rownames(y[[group]]$filtered.transformed),
          colnames(y[[group]]$filtered.transformed)
        )
      }
    }
  }
}

##### Put the results for the two groups side by side
y[["comb"]]$noNorm <- cbind(
  y[[targets_mod.list[1]]]$noNorm,
  y[[targets_mod.list[2]]]$noNorm
)
y[["comb"]]$norm <- cbind(
  y[[targets_mod.list[1]]]$norm,
  y[[targets_mod.list[2]]]$norm
)

##### Store the normalised data unless normalisation was switched off
if ( tolower(params$norm) == "none" ) {
  ref_dataset.list[[dataset]][["combined_data_processed"]] <- y[["comb"]]$noNorm
} else {
  ref_dataset.list[[dataset]][["combined_data_processed"]] <- y[["comb"]]$norm
}

##### Clean the space
rm(targets_mod.list)
##### Plot expression distribution of samples for unnormalised and normalised data

##### Legend labels are built here, so that the plots do not depend on labels that are otherwise defined only when filtering is enabled
if ( !is.null(add_cancer_group) ) {
  legend.labels <- c(ext_cancer_group, add_cancer_group, int_cancer_group, "Patient")
} else {
  legend.labels <- c(ext_cancer_group, int_cancer_group, "Patient")
}

##### Helper drawing one box-plot panel
#####   mat        = expression matrix with samples in columns
#####   main       = panel title
#####   colours    = list with group colours (1st element, used in the legend) and per-sample colours (2nd element, used for the boxes)
#####   labels     = legend labels
#####   legend.cex = legend text size
##### NOTE: the plot used to be called with both "las=2" and "las=3". Only the first occurrence of a repeated graphical parameter is used by the box-plot code, so just "las=2" is kept
plotExprBoxplot <- function(mat, main, colours, labels, legend.cex = 1) {
  boxplot(mat, las=2, col=colours[[2]], main="", pch="", xaxt="n", outline = FALSE)
  title(main=main, ylab=params$transform)
  legend("topright", legend=labels, fill=colours[[1]], horiz=TRUE, bg = "transparent", cex=legend.cex, box.col="transparent")
}

par(mfrow=c(2,1), mar=c(2, 5, 3, 2))

##### Unnormalised data
plotExprBoxplot(y[["comb"]]$noNorm, main="Unnormalised data", colours=targets.colour, labels=legend.labels)
data_nonnormalised <- recordPlot()

##### Normalised data
plotExprBoxplot(y[["comb"]]$norm, main=paste0("Normalised data (", params$norm, ")"), colours=targets.colour, labels=legend.labels)
data_normalised <- recordPlot()

##### Save the plot as png file
png(paste0(PlotsDir, "/normalisation.png"), width=900, height=700, pointsize = 14)
par(mfrow=c(2,1), mar=c(2, 5, 3, 2))

##### Unnormalised data
plotExprBoxplot(y[["comb"]]$noNorm, main="Unnormalised data", colours=targets.colour, labels=legend.labels, legend.cex=0.7)

##### Normalised data
plotExprBoxplot(y[["comb"]]$norm, main=paste0("Normalised data (", params$norm, ")"), colours=targets.colour, labels=legend.labels, legend.cex=0.7)
invisible(dev.off())

##### Clean the space (only the objects that exist are removed)
rm(list = intersect(c("den", "y", "legend.labels", "plotExprBoxplot"), ls()))
##### Batch effects are corrected by treating the investigated sample together with the internal reference cohort as one batch (regardless of the investigated patient tissue origin) and the TCGA data (of any cancer type) as another batch. The aim is to remove as much of the data variation due to technical factors as possible.
batches <- as.character(ref_dataset.list[[dataset]][["sample_annot"]]$Dataset)

##### Assign the investigated sample to the internal reference cohort batch (the first entry matching the sample name is relabelled)
batches <- replace(batches, match(sample_name, batches), int_cancer_group)

##### Remove the batch effects from the processed data using limma
ref_dataset.list[[dataset]][["batch_effect_corrected"]] <- limma::removeBatchEffect(
  ref_dataset.list[[dataset]][["combined_data_processed"]],
  batch = batches
)
suppressMessages(library(plotly))

##### Principal component analysis (PCA) of the processed data and, if requested, of the batch-effect corrected data
##### The analysis is repeated for each combined dataset
for ( dataset in names(ref_dataset.list) ) {

  ##### Sample annotation with the investigated sample labelled as "Patient"
  target <- ref_dataset.list[[dataset]][["sample_annot"]]
  target$Target <- gsub(sample_name, "Patient", target$Target)
  target$Dataset <- gsub(sample_name, "Patient", target$Dataset)

  if ( !params$batch_rm ) {

    ##### No batch-effect correction requested: single PCA, and the processed data is the data to report
    ref_dataset.list[[dataset]][["pca_combined_data_processed"]] <- pca(
      data = ref_dataset.list[[dataset]][["combined_data_processed"]],
      targets = target,
      report_dir = results_dir
    )

    ref_dataset.list[[dataset]][["data_to_report"]] <- ref_dataset.list[[dataset]][["combined_data_processed"]]

  } else {

    ##### PCA before...
    ref_dataset.list[[dataset]][["pca_combined_data_processed"]] <- pca(
      data = ref_dataset.list[[dataset]][["combined_data_processed"]],
      targets = target,
      title = "Before batch-effects correction",
      report_dir = results_dir,
      suffix = "_before_batch_rm"
    )

    ##### ...and after batch-effects correction
    ref_dataset.list[[dataset]][["pca_batch_effect_corrected"]] <- pca(
      data = ref_dataset.list[[dataset]][["batch_effect_corrected"]],
      targets = target,
      title = "After batch-effects correction",
      report_dir = results_dir,
      suffix = "_after_batch_rm"
    )

    ##### The batch-effect corrected data is the data to report
    ref_dataset.list[[dataset]][["data_to_report"]] <- ref_dataset.list[[dataset]][["batch_effect_corrected"]]
  }
}

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload=FALSE)

#### Close the open graphics device, if any, to free up some memory
if ( !is.null(dev.list()) ) {
  invisible(dev.off())
}
##### Generate relative log expression (RLE) plot using combined-only data and batch-effect corrected data

##### Helper drawing one RLE panel
#####   mat     = expression matrix with samples in columns
#####   main    = panel title
#####   colours = list with group colours (1st element, used in the legend) and per-sample colours (2nd element, used for the boxes)
#####   labels  = legend labels
plotRLEPanel <- function(mat, main, colours, labels) {
  plotRLE(mat, col=colours[[2]], main="", pch="", las=3, xaxt="n", outline = FALSE)
  title(main=main, ylab="RLE")
  legend("topright", legend=labels, fill=colours[[1]], horiz=TRUE, bg = "transparent", box.col="transparent")
}

##### Loop through combined datasets and generate RLE plot
for ( dataset in names(ref_dataset.list) ) {
  target <- ref_dataset.list[[dataset]][["sample_annot"]]
  target$Dataset <- gsub(sample_name, "Patient", target$Dataset)
  target$Target <- gsub(sample_name, "Patient", target$Target)

  ##### Legend labels (sample groups with the investigated sample labelled as "Patient")
  rle.labels <- levels(factor(target$Target))

  if ( params$batch_rm ) {
    par(mfrow=c(2,1), mar=c(2, 5, 3, 2))

    ##### Before batch-effects correction
    plotRLEPanel(ref_dataset.list[[dataset]][["combined_data_processed"]], main="Before batch-effects correction", colours=targets.colour, labels=rle.labels)
    ref_dataset.list[[dataset]][["rle_combined_data_processed"]] <- recordPlot()

    ##### After batch-effects correction
    plotRLEPanel(ref_dataset.list[[dataset]][["batch_effect_corrected"]], main="After batch-effects correction", colours=targets.colour, labels=rle.labels)
    ref_dataset.list[[dataset]][["rle_batch_effect_corrected"]] <- recordPlot()

    ##### Save the plot as png file
    png(paste0(PlotsDir, "/rle.png"), width=900, height=700, pointsize = 14)
    par(mfrow=c(2,1), mar=c(2, 5, 3, 2))

    ##### Before batch-effects correction
    plotRLEPanel(ref_dataset.list[[dataset]][["combined_data_processed"]], main="Before batch-effects correction", colours=targets.colour, labels=rle.labels)

    ##### After batch-effects correction
    plotRLEPanel(ref_dataset.list[[dataset]][["batch_effect_corrected"]], main="After batch-effects correction", colours=targets.colour, labels=rle.labels)
    invisible(dev.off())

  } else {
    plotRLEPanel(ref_dataset.list[[dataset]][["combined_data_processed"]], main="", colours=targets.colour, labels=rle.labels)
    ref_dataset.list[[dataset]][["rle_combined_data_processed"]] <- recordPlot()

    ##### Save the plot as png file
    png(paste0(PlotsDir, "/rle.png"), width=900, height=450, pointsize = 14)
    plotRLEPanel(ref_dataset.list[[dataset]][["combined_data_processed"]], main="", colours=targets.colour, labels=rle.labels)
    invisible(dev.off())
  }
}

#### Clear plots to free up some memory
if ( !is.null(dev.list()) ) {
  invisible(dev.off())
}

##### Clean the space. "den" and "y" may have been removed already after the normalisation plots, so remove only the objects that still exist
rm(list = intersect(c("targets.colour", "den", "y", "rle.labels", "plotRLEPanel"), ls()))
##### Loop through combined, BUT NOT PROCESSED, datasets and annotate ALL genes. This part is mainly required for biotype detection step

##### The annotation database query does not depend on the dataset, so it is run once, before the loop
##### Get genes annotation and genomic locations
edb <- get(paste0("EnsDb.Hsapiens.v", params$ensembl_version))

##### Get all Ensembl gene IDs available in the annotation database
keys <- keys(edb, keytype="GENEID")

##### Get genes biotypes, symbols and genomic coordinates
gene_info.all <- ensembldb::select(edb, keys=keys, columns=c("GENEID", "GENEBIOTYPE", "GENENAME", "SEQNAME", "GENESEQSTART", "GENESEQEND"), keytype="GENEID")
names(gene_info.all) <- gsub("GENEID", "ENSEMBL", names(gene_info.all))
names(gene_info.all) <- gsub("GENENAME", "SYMBOL", names(gene_info.all))

for ( dataset in names(ref_dataset.list) ) {

  ##### Only the gene IDs are needed to match the annotation with the data, so the expression values are not merged
  data <- ref_dataset.list[[dataset]][["combined_data"]]
  data.df <- data.frame(ENSEMBL = rownames(data), stringsAsFactors = FALSE)

  ##### Limit genes annotation to those genes for which sample expression measurements are available
  gene_info <- gene_info.all[ gene_info.all$ENSEMBL %in% data.df$ENSEMBL, ]

  ##### Remove rows with duplicated ENSEMBL IDs
  gene_info <- gene_info[!duplicated(gene_info$ENSEMBL),]
  rownames(gene_info) <- gene_info$ENSEMBL

  ##### Remove rows with duplicated gene symbols (Y_RNAs, SNORs, LINC0s etc)
  gene_info <- gene_info[!duplicated(gene_info$SYMBOL),]

  ##### Add info about immune response markers
  gene_info.immune_markers <- merge(gene_info, ref_genes.list[["genes_immune"]]$immune_markers, by = "SYMBOL", all.x = TRUE)

  ##### Keep only immune response markers for which there is available annotation
  ref_genes.list[["genes_immune"]]$immune_markers <- ref_genes.list[["genes_immune"]]$immune_markers[ ref_genes.list[["genes_immune"]]$immune_markers$SYMBOL %in% gene_info.immune_markers$SYMBOL, ]

  ##### Add info about immunogram genes
  if ( params$immunogram ) {
    gene_info.immunogram <- merge(gene_info, ref_genes.list[["genes_immune"]]$immunogram, by = "SYMBOL", all.x = TRUE)
    gene_info.immunogram <- gene_info.immunogram[!duplicated(gene_info.immunogram[,"ENSEMBL"]),]

    ##### Keep only immunogram genes for which there is available annotation
    ref_genes.list[["genes_immune"]]$immunogram <- ref_genes.list[["genes_immune"]]$immunogram[ ref_genes.list[["genes_immune"]]$immunogram$SYMBOL %in% gene_info.immunogram$SYMBOL, ]

    ##### Merge genes annotations for immunogram genes and immune markers
    gene_info <- merge( gene_info.immunogram, gene_info.immune_markers[ , c("ENSEMBL", "Immune_Cycle_Role") ], by = "ENSEMBL")
  } else {
    gene_info <- gene_info.immune_markers
  }

  ##### Keep the annotation for genes present in the data, ordered by ENSEMBL ID. NOTE: "gene_info" from the last dataset is used later to annotate the processed data
  data.annot <- merge(gene_info, data.df, by = "ENSEMBL", all.x = FALSE)
  rownames(data.annot) <- data.annot$ENSEMBL

  ##### Store the annotation columns ("CIC" is available only when the immunogram is requested)
  if ( params$immunogram ) {
    ref_dataset.list[[dataset]][["gene_annot_all"]] <- data.annot[, c("SYMBOL", "GENEBIOTYPE", "ENSEMBL", "SEQNAME", "GENESEQSTART", "GENESEQEND", "CIC", "Immune_Cycle_Role")]
  } else {
    ref_dataset.list[[dataset]][["gene_annot_all"]] <- data.annot[, c("SYMBOL", "GENEBIOTYPE", "ENSEMBL", "SEQNAME", "GENESEQSTART", "GENESEQEND", "Immune_Cycle_Role")]
  }

  ##### Save the combined expression matrix and the genes annotation into txt files
  write.table(prepare2write(ref_dataset.list[[dataset]][["combined_data"]]), file = paste0(results_dir, "/", sample_name, ".RNAseq_report.combined_data.txt"), sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE, append = FALSE )
  write.table(prepare2write(ref_dataset.list[[dataset]][["gene_annot_all"]]), file = paste0(results_dir, "/", sample_name, ".RNAseq_report.gene_annot_all.txt"), sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE, append = FALSE )
}

##### Clean the space (only the objects that exist are removed)
rm(list = intersect(c("data", "target", "data.df", "edb", "keys", "gene_info.all"), ls()))
##### Loop through combined datasets and annotate genes
for ( dataset in names(ref_dataset.list) ) {

  ##### Convert data into a data frame to make the Ensembl ID and gene symbol matches (with merge function). The expression values are kept numeric (binding them with the gene IDs into one matrix would turn them into text), and the sample names are kept as they are
  data <- ref_dataset.list[[dataset]][["data_to_report"]]
  data.df <- data.frame(ENSEMBL = rownames(data), data, row.names = NULL, check.names = FALSE, stringsAsFactors = FALSE)

  ##### Merge genes genomic coordinates info with their annotation and expression data. NOTE: "gene_info" is the annotation table created while annotating all genes
  data.annot <- merge(gene_info, data.df, by = "ENSEMBL", all.x = FALSE)

  ##### Keep only genes for which gene symbol is available
  data.annot <- data.annot[!(is.na(data.annot$SYMBOL) | data.annot$SYMBOL==""), ]
  rownames(data.annot) <- data.annot$SYMBOL

  ##### Get numeric data matrix with gene symbols as row names. "drop = FALSE" keeps the matrix shape even if a single gene or sample is left
  ref_dataset.list[[dataset]][["data_to_report"]] <- as.matrix(data.annot[, colnames(data), drop = FALSE])
  storage.mode(ref_dataset.list[[dataset]][["data_to_report"]]) <- "double"
  rownames(ref_dataset.list[[dataset]][["data_to_report"]]) <- data.annot$SYMBOL
  ref_dataset.list[[dataset]][["gene_annot"]] <- data.annot[, c("SYMBOL", "GENEBIOTYPE", "ENSEMBL", "SEQNAME", "GENESEQSTART", "GENESEQEND", "Immune_Cycle_Role")]

  ##### Save the combined expression matrix, genes list and associated targets into txt files
  write.table(prepare2write(ref_dataset.list[[dataset]][["data_to_report"]]), file = paste0(results_dir, "/", sample_name, ".RNAseq_report.combined_data_processed.txt"), sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE, append = FALSE )
  write.table(prepare2write(toupper(rownames(ref_dataset.list[[dataset]][["data_to_report"]]))), file = paste0(results_dir, "/", sample_name, ".RNAseq_report.combined_data_processed.genes.txt"), sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE, append = FALSE )
  write.table(prepare2write(ref_dataset.list[[dataset]][["sample_annot"]]), file = paste0(results_dir, "/", sample_name, ".RNAseq_report.sample_annot.txt"), sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE, append = FALSE )
}

##### Clean the space
rm(data, data.df, gene_info)
##### Save the entire expression data for all genes measured in patient's sample with cancer genes annotation as data table html files
##### NOTE: "dataset" is not set in this section; it keeps the value left by the last iteration of the preceding loop over names(ref_dataset.list)
targets <- ref_dataset.list[[dataset]][["sample_annot"]]
data <- ref_dataset.list[[dataset]][["data_to_report"]]

##### Generate the expression summary tables, first with percentiles ("perc") and then with Z-scores ("z"). All remaining settings are shared between the two tables
genes.expr <- list()

for ( expr_type in c("perc", "z") ) {
  genes.expr[expr_type] <- list(
    exprTable( genes = rownames(data), keep_all = TRUE, data = data, targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group, genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")], oncokb_annot = ref_genes.list[["genes_oncokb"]], cancer_genes = ref_genes.list[["genes_cancer"]], ext_links = TRUE, type = expr_type, scaling = scaling)
  )
}

##### Create directory for saving tables (if it is not there yet)
exprTableDir <- paste(results_dir, "exprTables", sep = "/")

if ( !file.exists(exprTableDir) ) {
  dir.create(exprTableDir, recursive=TRUE)
}

##### Save the first element of each exprTable output (the widget) as "genes.expr.perc.html" and "genes.expr.z.html"
for ( expr_type in names(genes.expr) ) {
  saveWidgetFix(widget=genes.expr[[expr_type]][[1]], file=paste(exprTableDir, paste0("genes.expr.", expr_type, ".html"), sep = "/"), selfcontained=TRUE)
}

##### Clean the space
rm(data, targets, genes.expr, expr_type)
##### Combine expression data with mutation and CN data if available
##### NOTE: "dataset" is not set in this section; it keeps the value left by the last loop over names(ref_dataset.list)
cn_data <- ref_genes.list[["purple"]]
expr_data <- ref_dataset.list[[dataset]][["data_to_report"]]
targets <- ref_dataset.list[[dataset]][["sample_annot"]]

##### ...percentiles (the second element of the exprTable output is used as the data table)
expr_data.perc <- exprTable( genes = rownames(expr_data), keep_all = TRUE, data = expr_data, targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group, type = "perc", scaling = scaling)[[2]]

expr_genes <- expr_data.perc$SYMBOL

##### ...Z-scores, requested for the same genes
expr_data.z <- exprTable( genes = expr_genes, keep_all = TRUE, data = expr_data, targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group, type = "z", scaling = scaling)[[2]]

##### Index the percentiles table by gene symbol
##### NOTE(review): only the percentiles table is re-indexed here, while the Z-scores are labelled with "expr_genes" below. This presumably relies on exprTable returning the genes in the same order for both tables - confirm
expr_data.perc <- expr_data.perc[ expr_genes, ]

##### Use the "Diff" column if the comparison cancer group differs from the internal one, otherwise the "Patient vs [comp_cancer]" column
expr_col <- if ( comp_cancer_group != int_cancer_group ) "Diff" else paste0( "Patient vs ", comp_cancer_group)
expr_data.perc <- expr_data.perc[, expr_col ]
expr_data.z <- expr_data.z[, expr_col ]

names(expr_data.perc) <- expr_genes
names(expr_data.z) <- expr_genes

##### Calculate the mean CN for each gene
cn_data$MeanCopyNumber <- rowMeans(cbind(cn_data$MinCopyNumber, cn_data$MaxCopyNumber))

##### Deal with negative CN values (missing values are left untouched)
cn_data$MeanCopyNumber[ which(cn_data$MeanCopyNumber < 0) ] <- 0

##### Remove entries with missing gene symbol
cn_data <- cn_data[ cn_data$Gene %!in% "", ]

##### Keep the CN data for all genes (used afterwards to draw the CN values distribution) and get the percentiles (0%, 5%, ..., 100%) from the CN values
cn_data.all <- cn_data
cn_data.all.percent <- quantile(cn_data.all$MeanCopyNumber, probs = seq(0, 1, .05), na.rm = TRUE)

##### Keep only genes with available expression data
##### NOTE(review): this filter runs before mutated genes without expression data are added below, so those genes never have CN data and do not reach the final table - confirm that this is intended
cn_data <- cn_data[ cn_data$Gene %in% names(expr_data.z), ]

##### Add mutation data if available
has_mut_data <- !is.null(ref_genes.list[["pcgr"]])

if ( has_mut_data ) {
  mut_data <- ref_genes.list[["pcgr"]]
  
  ##### Remove entries with missing gene symbol (mainly variants in intergenic regions)
  mut_data <- mut_data[ !is.na(mut_data$SYMBOL) & mut_data$SYMBOL %!in% "", ]
  
  ##### Prepare one alteration label per mutated gene: the consequence for a single mutation, "multiple hits" otherwise
  mut_labels <- vapply(split(as.character(mut_data$CONSEQUENCE), as.character(mut_data$SYMBOL)), function(conseq) {
    if ( length(conseq) > 1 ) "Mutation: multiple hits" else paste0("Mutation: ", conseq)
  }, character(1))
  
  ##### Initiate variable for the gene mutation status for each gene with expression data
  gene.mut <- matrix("None", nrow = length(expr_data.z), ncol = 1, dimnames = list(names(expr_data.z), "Alterations"))
  
  ##### Flag genes with reported mutations
  mutated_expr_genes <- intersect(rownames(gene.mut), names(mut_labels))
  
  if ( length(mutated_expr_genes) > 0 ) {
    gene.mut[ mutated_expr_genes, "Alterations" ] <- mut_labels[ mutated_expr_genes ]
  }
  
  ##### If there is no expression value for a specific mutated gene then assume it's not expressed at all and assign the lowest value observed in that sample
  missing_genes <- setdiff(unique(as.character(mut_data$SYMBOL)), rownames(gene.mut))
  
  if ( length(missing_genes) > 0 ) {
    expr_data.perc <- c(expr_data.perc, setNames(rep(min(expr_data.perc), length(missing_genes)), missing_genes))
    expr_data.z <- c(expr_data.z, setNames(rep(min(expr_data.z), length(missing_genes)), missing_genes))
    
    ##### Use the same "Mutation: ..." labels as for the genes with expression data
    gene.mut <- rbind(gene.mut, matrix(mut_labels[ missing_genes ], ncol = 1, dimnames = list(missing_genes, "Alterations")))
  }
  
  ##### Genes with expression, mutation status and copy-number data
  genes.intersect <- intersect(intersect(rownames(gene.mut), cn_data$Gene), names(expr_data.perc))
  
} else {
  ##### Skip the step for processing mutation info and deal with expression and copy-number data
  genes.intersect <- intersect(cn_data$Gene, names(expr_data.perc))
}

##### Prepare data frame with all values in the "genes.intersect" order. Values are looked up by gene symbol (first match), so this also works for a single gene and for genes listed more than once in the CN data
cn_data.sub <- data.frame(
  Gene = genes.intersect,
  CN = cn_data$MeanCopyNumber[ match(genes.intersect, cn_data$Gene) ],
  Perc_diff = expr_data.perc[ genes.intersect ],
  Z_score_diff = expr_data.z[ genes.intersect ]
)

if ( has_mut_data ) {
  cn_data.sub <- data.frame(cn_data.sub, Alterations = unname(gene.mut[ genes.intersect, "Alterations" ]))
}

ref_dataset.list[[dataset]][["expr_mut_cn_data_all"]] <- cn_data.sub

##### Limit the data to include only cancer genes
cn_data.sub <- cn_data.sub[ cn_data.sub$Gene %in% rownames(ref_genes.list[["genes_cancer"]]), ]

##### Keep genes meeting the user-defined CN values thresholds (genes with missing CN value are dropped)
cn_keep <- which(cn_data.sub$CN <= cn_bottom | cn_data.sub$CN >= cn_top)
ref_dataset.list[[dataset]][["expr_mut_cn_data"]] <- cn_data.sub[ cn_keep, ]

##### Clean the space. Only objects that exist are removed, since the mutation-related ones are not created when no mutation data is provided
rm(list = intersect(c("cn_data", "cn_data.sub", "expr_data", "gene.mut", "mut_data", "targets", "expr_data.z", "expr_data.perc", "expr_data.z.sub", "expr_data.perc.sub", "expr_genes", "gene.mut.sub", "genes.intersect", "expr_col", "has_mut_data", "mut_labels", "mutated_expr_genes", "missing_genes", "cn_keep"), ls()))
suppressMessages(library(plotly))

##### Draw histogram of CN data with dashed vertical lines marking the 5th, 50th and 95th percentiles
##### "cn_data.all.percent" holds the quantiles for probs = seq(0, 1, .05), so elements 2, 11 and 20 are the 5th, 50th and 95th percentiles, respectively
##### Each line is drawn through 11 points spanning a fixed 0-1000 range on the frequency axis
cn_dist_plot <- plot_ly(x = cn_data.all$MeanCopyNumber, type = 'histogram', name = "CN data", width = 800, height = 300) %>%
  
  ##### Add 5th percentile threshold
  add_lines(y = seq(0,1000, 100), x = rep(cn_data.all.percent[2],11), 
              line = list(color = "black", dash = "dash"), opacity = 0.4,
              name = "5th percentile", showlegend = TRUE) %>%
  
  ##### Add 50th percentile
  add_lines(y = seq(0,1000, 100), x = rep(cn_data.all.percent[11],11), 
              line = list(color = "black", dash = "dash"), opacity = 0.7,
              name = "50th percentile", showlegend = TRUE) %>%
  
  ##### Add 95th percentile threshold
  add_lines(y = seq(0,1000, 100), x = rep(cn_data.all.percent[20],11), 
              line = list(color = "black", dash = "dash"), opacity = 1,
              name = "95th percentile", showlegend = TRUE) %>%
  
  ##### "FALSE" is spelled out since "F" is an ordinary variable that can be reassigned
  layout(xaxis = list( title = "CN values"), yaxis = list( title = "Frequency"), margin = list(l=50, r=50, b=50, t=50, pad=4), autosize = FALSE)

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload=FALSE)

##### Close the current graphics device, if any is open, to free up some memory
if(!is.null(dev.list())) invisible(dev.off())

##### Clean the space
rm(cn_data.all)
##### Known fusions from the Cancer Biomarkers database (CGI) (https://www.cancergenomeinterpreter.org/biomarkers)
##### Separate multiple entries with ", " rather than ";" and use single underscore between the genes in the translocation names
known_translocations.CGI <- caner_genes_annot.list[["cancer_biomarkers_trans"]]
known_translocations.CGI$cancer_acronym <- gsub(";", ", ", known_translocations.CGI$cancer_acronym)
known_translocations.CGI$source <- gsub(";", ", ", known_translocations.CGI$source)
known_translocations.CGI$translocation <- gsub("__", "_", known_translocations.CGI$translocation)

##### Known fusions from FusionGDB (https://ccsm.uth.edu/FusionGDB)
known_translocations.FusionGDB <- caner_genes_annot.list[["FusionGDB"]]

##### Merge info from both resources by the fusion name, keeping fusions reported in either of them
known_translocations <- merge(known_translocations.FusionGDB, known_translocations.CGI, by.x = "FGname", by.y = "translocation", all = TRUE, sort=FALSE)

##### Extract gene pairs involved in reported gene fusions: gene A is the part of the fusion name before the first "_" and gene B is the part after the last "_"
known_gene_pairs <- data.frame(
  geneA = sub("_.*", "", known_translocations$FGname),
  geneB = sub(".*_", "", known_translocations$FGname),
  stringsAsFactors = FALSE
)
known_translocations <- cbind(known_translocations, known_gene_pairs)

##### Reported gene pairs in "geneA-geneB" format, in the same order as rows in "known_translocations"
trans.pairs <- paste(known_gene_pairs$geneA, known_gene_pairs$geneB, sep = "-")
rm(known_gene_pairs)
##### Read in the arriba fusion calls and rename the columns so that the gene 1/2 suffixes become A/B (e.g. "X.gene1" -> "geneA")
##### NOTE: the last two substitutions replace every "1" and "2" in every column name
arriba.fusions <- ref_genes.list[["arriba"]]
colnames(arriba.fusions) <- gsub("X.gene1", "geneA", colnames(arriba.fusions))
colnames(arriba.fusions) <- gsub("1", "A", colnames(arriba.fusions))
colnames(arriba.fusions) <- gsub("2", "B", colnames(arriba.fusions))

#####  Note the fusions order, which will be later required for embedding Arriba plots from corresponding pdf booklet pages
arriba.fusions.order <- paste(arriba.fusions$geneA, arriba.fusions$geneB, sep="__")

##### Add columns for info about reported fusions
fusions <- cbind(arriba.fusions, data.frame(matrix("", ncol = 5, nrow = nrow(arriba.fusions)), stringsAsFactors = FALSE))
colnames(fusions)[(ncol(fusions)-4):ncol(fusions)] <- c("FGID", "reported_fusion", "reported_fusion_geneA", "reported_fusion_geneB", "effector_gene")

##### Genes involved (as either partner) in known fusions, and known effector genes
reported_fusion_genes <- c(as.character(known_translocations$geneA), as.character(known_translocations$geneB))
effector_genes <- as.character(known_translocations$effector_gene)
effector_genes <- effector_genes[ !is.na(effector_genes) ]

##### Add annotations about known fusion events
##### Loop through all gene fusions detected by arriba and check which are already reported
for ( i in seq_len(nrow(fusions)) ) {
  geneA <- as.character(fusions$geneA[i])
  geneB <- as.character(fusions$geneB[i])
  
  ##### First check if the exact reported gene pair was detected by arriba (in the A-B orientation first, then B-A)
  reported_pair <- intersect(c(paste(geneA, geneB, sep="-"), paste(geneB, geneA, sep="-")), trans.pairs)
  
  if ( length(reported_pair) > 0 ) {
    
    fusions$reported_fusion[i] <- "Yes"
    
    ##### provide fusion ID from FusionGDB (the first one if the pair is listed more than once)
    fusions$FGID[i] <- known_translocations$FGID[ trans.pairs %in% reported_pair[1] ][1]
    
    fusions$reported_fusion_geneA[i] <- "Yes"
    fusions$reported_fusion_geneB[i] <- "Yes"
    
  ##### Now check if any of the arriba detected fusion genes are reported
  } else {
    fusions$reported_fusion[i] <- "-"
    
    ##### Check if gene A is involved in any reported fusion (as either gene A or gene B)
    if ( geneA %in% reported_fusion_genes ) {
      fusions$reported_fusion_geneA[i] <- "Yes"
    }
    
    ##### Check if gene B is involved in any reported fusion (as either gene A or gene B)
    if ( geneB %in% reported_fusion_genes ) {
      fusions$reported_fusion_geneB[i] <- "Yes"
    }
    
    ##### Flag if any of the genes is an effector gene. "%in%" is used for both genes since the gene is compared against the whole effector genes column
    if ( geneA %in% effector_genes ) {
      fusions$effector_gene[i] <- geneA
    } else if ( geneB %in% effector_genes ) {
      fusions$effector_gene[i] <- geneB
    }
  }
}

##### Sum split reads in gene A and B
fusions$split_reads <- fusions$split_readsA + fusions$split_readsB

##### Add column indicating fusions containing known cancer genes (at least one of the two genes is in the cancer genes list)
cancer_gene_symbols <- rownames(ref_genes.list[["genes_cancer"]])
fusions$fusions_cancer <- rep("-", nrow(fusions))
fusions$fusions_cancer[ fusions$geneA %in% cancer_gene_symbols | fusions$geneB %in% cancer_gene_symbols ] <- "Yes"

##### Re-ordering arriba's results on the basis of Arriba's confidence, reported fusions and then read count values (first by split count and then paircount) and then involvement of cancer genes and reported one of the fusion genes
fusions <- fusions[ order(fusions$reported_fusion, fusions$split_reads, fusions$split_readsA, fusions$split_readsB, fusions$discordant_mates, fusions$fusions_cancer, fusions$reported_fusion_geneA, fusions$reported_fusion_geneB, decreasing = TRUE), ]
fusions <- fusions[order(factor(fusions$confidence, levels=c("high", "medium", "low"))), ]

##### Keep only key columns
fusions <- fusions[ colnames(fusions) %in% c("geneA", "geneB", "breakpointA", "breakpointB", "siteA", "siteB", "type", "split_reads", "split_readsA", "split_readsB", "discordant_mates", "confidence", "FGID", "reported_fusion", "reported_fusion_geneA", "reported_fusion_geneB", "effector_gene", "fusions_cancer")]

##### Add column to flag fusions supported by WGS data (from MANTA), if available
fusions$geneA_dna_support <- "-"
fusions$geneB_dna_support <- "-"

##### Flag fusions as detected by Arriba if results from other tools are going to be added
if ( runPizzlyChunk || runDragenFusionChunk ) {
  fusions$Arriba <- c(rep("Yes", nrow(fusions)))
}

##### Clean the space. Only objects that exist are removed
rm(list = intersect(c("arriba.fusions", "arriba.fusion.transcripts", "arriba.cancer_genes", "arriba.other_genes", "reported_fusion_genes", "effector_genes", "reported_pair", "cancer_gene_symbols"), ls()))
##### Read in the dragen fusion calls and rename the columns so that the gene 1/2 suffixes become A/B (e.g. "gene1" -> "geneA")
##### NOTE: the last two substitutions replace every "1" and "2" in every column name
dragen.fusions <- ref_genes.list[["dragenFusion"]]
colnames(dragen.fusions) <- gsub("gene1", "geneA", colnames(dragen.fusions))
colnames(dragen.fusions) <- gsub("1", "A", colnames(dragen.fusions))
colnames(dragen.fusions) <- gsub("2", "B", colnames(dragen.fusions))

##### Add columns for info about reported fusions
dragen.fusions <- cbind(dragen.fusions, data.frame(matrix("", ncol = 5, nrow = nrow(dragen.fusions)), stringsAsFactors = FALSE))
colnames(dragen.fusions)[(ncol(dragen.fusions)-4):ncol(dragen.fusions)] <- c("FGID", "reported_fusion", "reported_fusion_geneA", "reported_fusion_geneB", "effector_gene")

##### Genes involved (as either partner) in known fusions, and known effector genes
reported_fusion_genes <- c(as.character(known_translocations$geneA), as.character(known_translocations$geneB))
effector_genes <- as.character(known_translocations$effector_gene)
effector_genes <- effector_genes[ !is.na(effector_genes) ]

##### Add annotations about known fusion events
##### Loop through all gene fusions detected by dragen and check which are already reported
for ( i in seq_len(nrow(dragen.fusions)) ) {
  geneA <- as.character(dragen.fusions$geneA[i])
  geneB <- as.character(dragen.fusions$geneB[i])
  
  ##### First check if the exact reported gene pair was detected by dragen (in the A-B orientation first, then B-A)
  reported_pair <- intersect(c(paste(geneA, geneB, sep="-"), paste(geneB, geneA, sep="-")), trans.pairs)
  
  if ( length(reported_pair) > 0 ) {
    
    dragen.fusions$reported_fusion[i] <- "Yes"
    
    ##### provide fusion ID from FusionGDB (the first one if the pair is listed more than once)
    dragen.fusions$FGID[i] <- known_translocations$FGID[ trans.pairs %in% reported_pair[1] ][1]
    
    dragen.fusions$reported_fusion_geneA[i] <- "Yes"
    dragen.fusions$reported_fusion_geneB[i] <- "Yes"
    
  ##### Now check if any of the dragen detected fusion genes are reported
  } else {
    dragen.fusions$reported_fusion[i] <- "-"
    
    ##### Check if gene A is involved in any reported fusion (as either gene A or gene B)
    if ( geneA %in% reported_fusion_genes ) {
      dragen.fusions$reported_fusion_geneA[i] <- "Yes"
    }
    
    ##### Check if gene B is involved in any reported fusion (as either gene A or gene B)
    if ( geneB %in% reported_fusion_genes ) {
      dragen.fusions$reported_fusion_geneB[i] <- "Yes"
    }
    
    ##### Flag if any of the genes is an effector gene. "%in%" is used for both genes since the gene is compared against the whole effector genes column
    if ( geneA %in% effector_genes ) {
      dragen.fusions$effector_gene[i] <- geneA
    } else if ( geneB %in% effector_genes ) {
      dragen.fusions$effector_gene[i] <- geneB
    }
  }
}

##### Add column indicating fusions containing known cancer genes (at least one of the two genes is in the cancer genes list)
cancer_gene_symbols <- rownames(ref_genes.list[["genes_cancer"]])
dragen.fusions$fusions_cancer <- rep("-", nrow(dragen.fusions))
dragen.fusions$fusions_cancer[ dragen.fusions$geneA %in% cancer_gene_symbols | dragen.fusions$geneB %in% cancer_gene_symbols ] <- "Yes"

##### Re-ordering dragen's results on the basis of reported fusions and then score (as Dragen doesn't include split count and paircount info) and then involvement of cancer genes and reported one of the fusion genes
dragen.fusions <- dragen.fusions[ order(dragen.fusions$reported_fusion, dragen.fusions$Score, dragen.fusions$fusions_cancer, dragen.fusions$reported_fusion_geneA, dragen.fusions$reported_fusion_geneB, decreasing = TRUE), ]
#dragen.fusions <- dragen.fusions[order(factor(dragen.fusions$confidence, levels=c("high", "medium", "low"))), ]

##### Keep only key columns
dragen.fusions <- dragen.fusions[ colnames(dragen.fusions) %in% c("geneA", "geneB", "Score", "LeftBreakpoint", "RightBreakpoint", "GeneALocation", "GeneBLocation", "NumSplitReads", "NumSoftClippedReads", "FGID", "reported_fusion", "reported_fusion_geneA", "reported_fusion_geneB", "effector_gene", "fusions_cancer")]

##### Add column to flag fusions supported by WGS data (from MANTA), if available
dragen.fusions$geneA_dna_support <- "-"
dragen.fusions$geneB_dna_support <- "-"

##### Dragen's fusion format version 3.9.3 is recognised by the presence of the gene location, read count and score columns. Otherwise the format prior to version 3.9.3 is assumed
dragen_v393 <- all(c("GeneALocation", "GeneBLocation", "NumSplitReads","NumSoftClippedReads", "Score") %in% colnames(dragen.fusions))

##### Add results from Arriba
if ( runArribaChunk ) {
  
  ##### Add column with Dragen fusions and the columns that are specific to Dragen results
  dragen.fusions$fusion <- paste(dragen.fusions$geneA, dragen.fusions$geneB, sep="__")
  fusions$Dragen <- c(rep("-", nrow(fusions)))
  fusions$split_reads <- fusions$split_readsA + fusions$split_readsB
  
  if ( dragen_v393 ) {
    fusions$soft_clipped_reads <- c(rep("-", nrow(fusions)))
  }
  
  fusions$score <- c(rep("-", nrow(fusions)))
  
  ##### Re-order columns
  fusions <- fusions %>% dplyr::relocate(split_reads, .before = split_readsA)
  
  if ( dragen_v393 ) {
    fusions <- fusions %>% dplyr::relocate(soft_clipped_reads, .before = confidence)
  }
  
  fusions <- fusions %>% dplyr::relocate(score, .before = FGID)
  
  ##### Loop through Dragen results, mark fusions detected by both tools. For those detected only by Dragen adapt results format to Arriba results
  for ( i in seq_len(nrow(dragen.fusions)) ) {
    
    ##### The fusions table is searched on every iteration since it also contains the Dragen-only fusions added so far
    fusion_idx <- match(dragen.fusions$fusion[i], paste(fusions$geneA, fusions$geneB, sep="__"))
    
    if ( !is.na(fusion_idx) ) {
      
      ##### Fusion already listed: flag it and blank out the corresponding row in Dragen results
      fusions$Dragen[ fusion_idx ] <- "Yes"
      dragen.fusions[ i, ] <-  rep("-", ncol(dragen.fusions))
      
    } else {
      
      ##### Gene locations and split reads counts are available only in format version 3.9.3
      new_fusion <- data.frame(geneA=dragen.fusions$geneA[i], geneB=dragen.fusions$geneB[i], breakpointA=dragen.fusions$LeftBreakpoint[i], breakpointB=dragen.fusions$RightBreakpoint[i], siteA=if ( dragen_v393 ) dragen.fusions$GeneALocation[i] else "-", siteB=if ( dragen_v393 ) dragen.fusions$GeneBLocation[i] else "-", type="-", split_reads=if ( dragen_v393 ) dragen.fusions$NumSplitReads[i] else "-", split_readsA="-", split_readsB="-", discordant_mates="-", confidence="-", score=dragen.fusions$Score[i], FGID=dragen.fusions$FGID[i], reported_fusion=dragen.fusions$reported_fusion[i], reported_fusion_geneA=dragen.fusions$reported_fusion_geneA[i], reported_fusion_geneB=dragen.fusions$reported_fusion_geneB[i], effector_gene=dragen.fusions$effector_gene[i], fusions_cancer=dragen.fusions$fusions_cancer[i], geneA_dna_support="-", geneB_dna_support="-", Arriba="-", Dragen="Yes" )
      
      if ( dragen_v393 ) {
        new_fusion$soft_clipped_reads <- dragen.fusions$NumSoftClippedReads[i]
      }
      
      ##### Columns are matched by name when the new row is added
      fusions <- rbind(fusions, new_fusion)
    }
  }
  
##### Otherwise use Dragen results only, with columns renamed to those expected from Arriba results
} else {
  fusions <- dragen.fusions
  
  ##### Rename columns
  names(fusions) <- gsub("LeftBreakpoint", "breakpointA", names(fusions))
  names(fusions) <- gsub("RightBreakpoint", "breakpointB", names(fusions))
  
  #####  Columns available only in Dragen's fusion format version 3.9.3
  if ( dragen_v393 ) {
    names(fusions) <- gsub("GeneALocation", "siteA", names(fusions))
    names(fusions) <- gsub("GeneBLocation", "siteB", names(fusions))
    names(fusions) <- gsub("NumSplitReads", "split_reads", names(fusions))
    names(fusions) <- gsub("NumSoftClippedReads", "soft_clipped_reads", names(fusions))
  }
  
  names(fusions) <- gsub("Score", "score", names(fusions))
}

##### Clean the space. Only objects that exist are removed
rm(list = intersect(c("dragen.fusion.transcripts", "dragen.cancer_genes", "dragen.other_genes", "reported_fusion_genes", "effector_genes", "reported_pair", "cancer_gene_symbols", "dragen_v393", "fusion_idx", "new_fusion"), ls()))
##### Read in the pizzly fusion calls
pizzly.fusion.candidates <- ref_genes.list[["pizzly"]]

##### Extract only those fusion genes that are in cancer genes list
pizzly.cancer_genes <- data.frame()

for (row in 1:nrow(pizzly.fusion.candidates)){
  if(pizzly.fusion.candidates[row,"geneA.name"] %in% rownames(ref_genes.list[["genes_cancer"]]) | pizzly.fusion.candidates[row,"geneB.name"] %in% rownames(ref_genes.list[["genes_cancer"]])) {
    
    ##### Creating a new dataframe for extracting pizzly rows with cancer gene hits
    pizzly.cancer_genes <- rbind(pizzly.cancer_genes, data.frame(pizzly.fusion.candidates[row,]))
  }
}

##### Extracting rows from pizzly results that are not cancer genes list
pizzly.other_genes <- pizzly.fusion.candidates[ rownames(pizzly.fusion.candidates) %!in% rownames(pizzly.cancer_genes), ]
  
##### Combing all the three above sorted dataframes
pizzly.fusions <- rbind(pizzly.cancer_genes, pizzly.other_genes)
  
##### Flag known fusions based on info from Cancer Biomarkers database (CGI) and FusionGDB (https://ccsm.uth.edu/FusionGDB)
##### Add columns for info about reported fusions
pizzly.fusions <- cbind(pizzly.fusions, data.frame(matrix("", ncol = 5, nrow = nrow(pizzly.fusions)), stringsAsFactors = FALSE))
colnames(pizzly.fusions)[(ncol(pizzly.fusions)-4):ncol(pizzly.fusions)] <- c("FGID", "reported_fusion", "reported_fusion_geneA", "reported_fusion_geneB", "effector_gene")
  
##### Add annotations about known fusion events
##### Loop through all genes involved in detected gene fusions (pizzly results) and check which are already reported
##### For every pizzly fusion the loop fills in:
#####   reported_fusion        "Yes" if the exact gene pair (in either orientation) is in trans.pairs, "-" otherwise
#####   FGID                   ID taken from known_translocations$FGID for the matching known pair
#####   reported_fusion_geneA  "Yes" if gene A is part of any known translocation
#####   reported_fusion_geneB  "Yes" if gene B is part of any known translocation
#####   effector_gene          gene A or gene B if it is listed in known_translocations$effector_gene
##### seq_len() is used so that an empty pizzly table is skipped (1:0 would iterate over rows 1 and 0)
for ( i in seq_len(nrow(pizzly.fusions)) ) {
  geneA <- as.character(pizzly.fusions$geneA.name[i])
  geneB <- as.character(pizzly.fusions$geneB.name[i])

  ##### Gene pair in both orientations, formatted the same way as the entries in trans.pairs
  pair_fwd <- paste(geneA, geneB, sep="-")
  pair_rev <- paste(geneB, geneA, sep="-")

  ##### First check if the exact reported gene pairs were detected by pizzly (A-B takes precedence over B-A)
  if ( pair_fwd %in% trans.pairs || pair_rev %in% trans.pairs ) {

    known_pair <- if ( pair_fwd %in% trans.pairs ) pair_fwd else pair_rev

    ##### Flag the fusion as reported
    pizzly.fusions$reported_fusion[i] <- "Yes"

    ##### Provide the fusion ID of the known pair. trans.pairs is used as a row index into known_translocations.
    ##### If the same pair is listed more than once only the first ID is used (a single cell cannot hold several)
    pizzly.fusions$FGID[i] <- known_translocations$FGID[ trans.pairs %in% known_pair ][1]

    pizzly.fusions$reported_fusion_geneA[i] <- "Yes"
    pizzly.fusions$reported_fusion_geneB[i] <- "Yes"

  ##### Now check if any of the pizzly detected fusion genes are reported
  } else {
    pizzly.fusions$reported_fusion[i] <- "-"

    ##### Check if pizzly gene A is either gene A or gene B in any of the reported fusions
    if ( geneA %in% known_translocations$geneA || geneA %in% known_translocations$geneB ) {
      pizzly.fusions$reported_fusion_geneA[i] <- "Yes"
    }

    ##### Check if pizzly gene B is either gene A or gene B in any of the reported fusions
    if ( geneB %in% known_translocations$geneA || geneB %in% known_translocations$geneB ) {
      pizzly.fusions$reported_fusion_geneB[i] <- "Yes"
    }

    ##### Flag if any of the genes is an effector gene (gene A takes precedence).
    ##### %in% is used for both genes: "==" against the whole effector_gene column yields a vector, which is not a valid "if" condition
    if ( geneA %in% known_translocations$effector_gene ) {
      pizzly.fusions$effector_gene[i] <- geneA
    } else if ( geneB %in% known_translocations$effector_gene ) {
      pizzly.fusions$effector_gene[i] <- geneB
    }
  }
}
  
##### Flag fusions containing known cancer genes: "Yes" if gene A or gene B is listed among the cancer-gene fusions, "-" otherwise
pizzly.fusions$fusions_cancer <- rep("-", nrow(pizzly.fusions))

if ( nrow(pizzly.cancer_genes) > 0 ) {
  pizzly.fusions$fusions_cancer[
    pizzly.fusions$geneA.name %in% pizzly.cancer_genes$geneA.name |
      pizzly.fusions$geneB.name %in% pizzly.cancer_genes$geneB.name
  ] <- "Yes"
}

##### Sort fusions (descending) by: reported status, split reads, read pairs, cancer-gene flag, then per-gene reported status
pizzly.fusions <- pizzly.fusions[
  order(
    pizzly.fusions$reported_fusion,
    pizzly.fusions$splitcount,
    pizzly.fusions$paircount,
    pizzly.fusions$fusions_cancer,
    pizzly.fusions$reported_fusion_geneA,
    pizzly.fusions$reported_fusion_geneB,
    decreasing = TRUE
  ),
]

##### Rename columns to match Arriba results (patterns are applied one after another, in this order)
arriba_col_names <- c("geneA.name" = "geneA", "geneB.name" = "geneB", "paircount" = "discordant_mates", "splitcount" = "split_reads")

for ( pizzly_col in names(arriba_col_names) ) {
  colnames(pizzly.fusions) <- gsub(pizzly_col, arriba_col_names[[ pizzly_col ]], colnames(pizzly.fusions))
}

##### Drop pizzly-specific columns that have no counterpart in Arriba results
pizzly.fusions <- pizzly.fusions[ colnames(pizzly.fusions) %!in% c("geneA.id", "geneB.id", "transcripts.list")]

##### Add results from Arriba
##### When Arriba results are available ("fusions"), pizzly fusions are merged into them:
#####   - gene pairs already present in "fusions" get Pizzly = "Yes"
#####   - gene pairs detected only by pizzly are appended as new rows, with "-" in Arriba-specific columns
if ( runArribaChunk ) {

  ##### Add column with pizzly fusions (gene pair key "geneA__geneB")
  pizzly.fusions$fusion <- paste(pizzly.fusions$geneA, pizzly.fusions$geneB, sep="__")
  fusions$Pizzly <- rep("-", nrow(fusions))

  ##### Loop through Pizzly results, mark fusions detected by both tools. For those detected only by pizzly adapt results format to Arriba results
  ##### seq_len() skips the loop for an empty pizzly table (1:0 would append an all-NA row and then fail)
  for ( i in seq_len(nrow(pizzly.fusions)) ) {

    ##### Position of this gene pair in the combined table (NA if absent). The key is rebuilt on every
    ##### iteration because "fusions" grows inside the loop, so a repeated pizzly pair matches the row appended earlier
    fusion_idx <- match(pizzly.fusions$fusion[i], paste(fusions$geneA, fusions$geneB, sep="__"))

    if ( !is.na(fusion_idx) ) {
      fusions$Pizzly[ fusion_idx ] <- "Yes"

      ##### NOTE(review): blanking the row also coerces every pizzly.fusions column to character, which affects
      ##### the types of rows appended afterwards. pizzly.fusions is removed below - confirm this is still needed
      pizzly.fusions[ i, ] <-  rep("-", ncol(pizzly.fusions))
    } else {
      fusions <- rbind(fusions, data.frame(
        geneA = pizzly.fusions$geneA[i],
        geneB = pizzly.fusions$geneB[i],
        breakpointA = "-",
        breakpointB = "-",
        siteA = "-",
        siteB = "-",
        type = "-",
        split_reads = pizzly.fusions$split_reads[i],
        split_readsA = "-",
        split_readsB = "-",
        discordant_mates = pizzly.fusions$discordant_mates[i],
        confidence = "-",
        FGID = pizzly.fusions$FGID[i],
        reported_fusion = pizzly.fusions$reported_fusion[i],
        reported_fusion_geneA = pizzly.fusions$reported_fusion_geneA[i],
        reported_fusion_geneB = pizzly.fusions$reported_fusion_geneB[i],
        effector_gene = pizzly.fusions$effector_gene[i],
        fusions_cancer = pizzly.fusions$fusions_cancer[i],
        geneA_dna_support = "-",
        geneB_dna_support = "-",
        Arriba = "-",
        Pizzly = "Yes"
      ))
    }
  }

##### Otherwise (no Arriba results) the pizzly table is used as the fusions table as it is
} else {
  fusions <- pizzly.fusions
}

##### Add column to flag fusions supported by WGS data (from MANTA), if available
fusions$geneA_dna_support <- "-"
fusions$geneB_dna_support <- "-"

##### Clean the space and return output
rm(pizzly.fusions, pizzly.fusion.transcripts, pizzly.fusion.candidates, known_translocations.CGI, known_translocations.FusionGDB, pizzly.cancer_genes, pizzly.other_genes, trans.pairs)
##### Annotate fusion genes with genomic coordinates
##### Gene annotation columns used for the annotation
fusion_genes_annot <- ref_dataset.list[[dataset]][["gene_annot_all"]][ , c("ENSEMBL", "SYMBOL", "SEQNAME", "GENESEQSTART", "GENESEQEND") ]

##### Working copy of the fusions table with a row counter, used to restore the original row order after merging
fusions.annot <- fusions
fusions.annot$order <- 1:nrow(fusions.annot)

##### Get genomic info for gene A and gene B of each fusion (gene symbol is the merge key; fusions without annotation are kept)
fusion_annot1 <- merge(fusion_genes_annot, fusions.annot[ , c("order","geneA")], by.x = "SYMBOL", by.y = "geneA", sort=FALSE, all.y = TRUE)
fusion_annot1 <- fusion_annot1[ order(fusion_annot1$order), ]
fusion_annot2 <- merge(fusion_genes_annot, fusions.annot[ , c("order","geneB")], by.x = "SYMBOL", by.y = "geneB", sort=FALSE, all.y = TRUE)
fusion_annot2 <- fusion_annot2[ order(fusion_annot2$order), ]

##### Columns available in every fusions table, regardless of the fusion caller(s) used
fusion_common_cols <- c("reported_fusion", "fusions_cancer", "reported_fusion_geneA", "reported_fusion_geneB")

##### Caller-specific columns
##### Dragen + Arriba
if ( runDragenFusionChunk && runArribaChunk ) {
  fusion_caller_cols <- c("score", "breakpointA", "breakpointB", "discordant_mates", "split_reads",  "split_readsA", "split_readsB")

##### Arriba / Arriba + Pizzly
} else if ( runArribaChunk ) {
  fusion_caller_cols <- c("breakpointA", "breakpointB", "discordant_mates", "split_reads", "split_readsA", "split_readsB")

##### Dragen only
} else if ( runDragenFusionChunk ) {

  #####  Dragen's fusion format version 3.9.3
  if ( all(c("GeneALocation", "GeneBLocation", "NumSplitReads","NumSoftClippedReads", "Score") %in% colnames(dragen.fusions)) ) {
    fusion_caller_cols <- c("score", "breakpointA", "breakpointB", "split_reads")

  #####  Dragen's fusion format prior to version 3.9.3
  } else {
    fusion_caller_cols <- c("score", "breakpointA", "breakpointB")
  }

##### Pizzly only
} else {
  fusion_caller_cols <- c("split_reads", "discordant_mates")
}

##### Combine annotation of both genes with the selected fusion columns
fusion_annot <- cbind(fusion_annot1, fusion_annot2, fusions.annot[, c(fusion_caller_cols, fusion_common_cols)])

##### Add column to flag fusions supported by WGS data (from MANTA), if available
fusion_annot$geneA_dna_support <- "-"
fusion_annot$geneB_dna_support <- "-"

##### Make column names unique (columns for gene B get a numeric suffix, e.g. SYMBOL.1)
colnames(fusion_annot) <- make.names(colnames(fusion_annot), unique=TRUE)

##### Remove entries with missing annotation
fusion_annot <- fusion_annot[complete.cases(fusion_annot), ]

##### Clean the space
rm(fusion_annot1, fusion_annot2, fusions.annot, fusion_genes_annot, fusion_caller_cols, fusion_common_cols)
##### Compare PIZZLY and MANTA called gene fusion events
##### Expand MANTA events so that there is one row per gene:
#####   - events with genes joined by "&" are duplicated; the first row keeps the first gene, the second row the second gene,
#####     and both rows keep the original "geneA&geneB" string in the "Fusion genes" column
#####   - single-gene events get an empty "Fusion genes" value
manta_sv <- ref_genes.list[["manta"]]
manta_sv$`Fusion genes` <- manta_sv$Gene

i <- 1
while ( i <= nrow(manta_sv) ) {

  event_genes <- strsplit(manta_sv$Gene[i], split='&', fixed=TRUE)[[1]]

  ##### Single-gene event: nothing to split, move to the next row
  if ( length(event_genes) <= 1 ) {
    manta_sv$`Fusion genes`[i] <- ""
    i <- i + 1
    next
  }

  ##### Insert a copy of the current row right below it, then assign one gene to each of the two rows
  manta_sv <- tibble::add_row(manta_sv, .after = i)
  manta_sv[i+1, ] <- manta_sv[i, ]
  manta_sv$Gene[i] <- event_genes[1]
  manta_sv$Gene[i+1] <- event_genes[2]

  ##### Skip the row that has just been inserted
  i <- i + 2
}

##### Compare fusion genes called by PIZZLY and MANTA
##### First limit MANTA output to fusions only (rows whose "Fusion genes" value contains "&"),
##### then keep only the genes that are also present in the RNA-based fusions table
if ( runFusionChunk ) {
  manta_fusions <- unique(manta_sv[ grep("&", manta_sv$"Fusion genes"),  ]$Gene)
  manta_fusions <- manta_fusions[ manta_fusions %in% unique(c(as.vector(fusions$geneA), as.vector(fusions$geneB))) ]

  ##### Flag fusions that were also reported in MANTA
  if ( length(manta_fusions) > 0 ) {

    ##### %in% flags every row involving a MANTA-supported gene (match() would only flag the first row per gene)
    fusions$geneA_dna_support[ fusions$geneA %in% manta_fusions ] <- "Yes"
    fusions$geneB_dna_support[ fusions$geneB %in% manta_fusions ] <- "Yes"

    fusion_annot$geneA_dna_support[ fusion_annot$SYMBOL %in% manta_fusions ] <- "Yes"
    fusion_annot$geneB_dna_support[ fusion_annot$SYMBOL.1 %in% manta_fusions ] <- "Yes"

    ##### Re-order fusion dataframe with MANTA supporting fusions on top
    if ( runArribaChunk ) {
      idx <- order(fusions$geneA_dna_support, fusions$geneB_dna_support, fusions$Arriba, fusions$reported_fusion, decreasing = TRUE)
    } else {
      idx <- order(fusions$geneA_dna_support, fusions$geneB_dna_support, fusions$reported_fusion, decreasing = TRUE)
    }

    fusions <- fusions[ idx, ]

    ##### Put the annotation table in the same order as the re-ordered fusions table.
    ##### "idx" cannot be reused here: fusion_annot may hold fewer rows than fusions (entries with missing
    ##### annotation were removed), so rows are aligned by gene pair instead of by position
    annot_rank <- match(
      paste(fusion_annot$SYMBOL, fusion_annot$SYMBOL.1, sep="__"),
      paste(fusions$geneA, fusions$geneB, sep="__")
    )
    fusion_annot <- fusion_annot[ order(annot_rank), ]
  }
}

##### Remove entries with missing annotation
fusion_annot <- fusion_annot[complete.cases(fusion_annot), ]

##### Clean the space and return output
rm(manta_fusions)
##### Filter out fusions that are with < 2 split reads and < 2 pair reads and are not supported by genomic data, are not reported and don't involve cancer genes
##### The same criteria are applied to "fusions" and "fusion_annot" so that both tables describe the same set of fusions
##### (the score threshold and the use of the score column previously differed between the two tables)

##### Dragen + Arriba
if ( runDragenFusionChunk && runArribaChunk ) {
  fusions <- fusions %>%
    dplyr::filter( split_reads > 1 | discordant_mates > 1 | score > 0 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")
  fusion_annot <- fusion_annot %>%
    dplyr::filter( split_reads > 1 | discordant_mates > 1 | score > 0 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")

##### Arriba / Arriba + Pizzly
} else if ( runArribaChunk ) {
  fusions <- fusions %>%
    dplyr::filter( split_reads > 1 | discordant_mates > 1 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")
  fusion_annot <- fusion_annot %>%
    dplyr::filter( split_reads > 1 | discordant_mates > 1 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")

##### Dragen only
} else if ( runDragenFusionChunk ) {

  ##### For Dragen , this filtering is not changing the results. We'll review the "Score" value again once we start to regularly produce the RNAsum report for Dragen results
  #####  Dragen's fusion format version 3.9.3
  if ( all(c("GeneALocation", "GeneBLocation", "NumSplitReads","NumSoftClippedReads", "Score") %in% colnames(fusions)) ) {
    fusions <- fusions %>%
      dplyr::filter( split_reads > 1 | score > 0 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")

    ##### NOTE(review): split_reads is not used for fusion_annot here because its presence in that table is decided
    ##### elsewhere (based on the raw Dragen table) - confirm and add "split_reads > 1" if the column is always present
    fusion_annot <- fusion_annot %>%
      dplyr::filter( score > 0 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")

  #####  Dragen's fusion format prior to version 3.9.3
  } else {
    fusions <- fusions %>%
      dplyr::filter( score > 0 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")
    fusion_annot <- fusion_annot %>%
      dplyr::filter( score > 0 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")
  }

##### Pizzly only
} else {
  fusions <- fusions %>%
    dplyr::filter( split_reads > 1 | discordant_mates > 1 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")
  fusion_annot <- fusion_annot %>%
    dplyr::filter( split_reads > 1 | discordant_mates > 1 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")
}
##### Indicate which fusions have genomic coordinates and can be presented on circos plot
##### Only reported fusions are considered, plus (when structural variants are processed) those with at least one gene supported by DNA
circos_candidates <- fusion_annot$reported_fusion == "Yes"

if ( runSVsChunk ) {
  circos_candidates <- circos_candidates | fusion_annot$geneA_dna_support == "Yes" | fusion_annot$geneB_dna_support == "Yes"
}

fusion_annot_top <- fusion_annot[ circos_candidates , ]

##### Flag the corresponding gene pairs in the fusions table ("Yes" = shown on circos plot, "-" = not shown)
circos_pairs <- paste(fusion_annot_top$SYMBOL, fusion_annot_top$SYMBOL.1, sep="-")

fusions$circos <- "-"
fusions$circos[ paste(fusions$geneA, fusions$geneB, sep="-") %in% circos_pairs ] <- "Yes"
##### Extract data for Immunogram genes
##### drop = FALSE keeps the two-dimensional structure even if a single gene (or a single sample) is selected
data <- ref_dataset.list[[dataset]][["data_to_report"]]
data <- data[ rownames(data) %in% ref_genes.list[["genes_immune"]]$immunogram$SYMBOL, , drop = FALSE ]

##### Create list with calculation results for each individual Cancer-Immunity Cycle (CIC) step
CIC.list <- vector("list", length(unique(ref_genes.list[["genes_immune"]]$immunogram$CIC)))
names(CIC.list) <- unique(ref_genes.list[["genes_immune"]]$immunogram$CIC)

##### Calculate average expression (per sample) for each Cancer-Immunity Cycle (CIC) step
for ( cic_step in unique(ref_genes.list[["genes_immune"]]$immunogram$CIC) ) {

  genes <- ref_genes.list[["genes_immune"]]$immunogram$SYMBOL[ ref_genes.list[["genes_immune"]]$immunogram$CIC %in% cic_step ]

  ##### Without drop = FALSE a CIC step with a single matching gene would collapse to a vector and colMeans() would fail
  data.sub <- data[ rownames(data) %in% genes, , drop = FALSE ]
  CIC.list[[cic_step]] <- colMeans(data.sub)
}

##### Convert the list into a table with samples in rows and CIC steps in columns
ref_genes.list[["genes_immune"]]$immunogram.df <- t(data.frame(matrix(unlist(CIC.list), nrow=length(CIC.list), byrow=TRUE), stringsAsFactors=FALSE))
colnames(ref_genes.list[["genes_immune"]]$immunogram.df) <- names(CIC.list)

##### Sample names are taken from the full data (identical to the columns of every per-step subset)
rownames(ref_genes.list[["genes_immune"]]$immunogram.df) <- colnames(data)
##### Summarise the reference cohorts samples
##### Count samples per group (Target column) and pick the counts for the external and internal reference cohorts
target <- ref_dataset.list[[dataset]][["sample_annot"]]
target_counts <- table(target$Target)

ref_ext_cancer <- target_counts[ names(target_counts) == ext_cancer_group ]
ref_int_cancer <- target_counts[ names(target_counts) == int_cancer_group ]

##### If an additional cancer group is used, its samples are counted together with the external reference cohort
if ( !is.null(add_cancer_group) ) {
  ref_ext_cancer <- ref_ext_cancer + target_counts[ names(target_counts) == add_cancer_group ]
}
##### Update altered genes in ...
##### ...gene fusion section: Include only those which are DNA-supported (see Structural variants section) or reported in FusionGDB
if ( runFusionChunk ) {
  summary_fusion_rows <- fusions$reported_fusion == "Yes" | fusions$geneA_dna_support == "Yes" | fusions$geneB_dna_support == "Yes"
  summary_fusions <- fusions[ summary_fusion_rows , ]

  ##### Unique list of genes (gene A and gene B) involved in the selected fusions
  ref_genes.list[["summary"]]$Fusion <- unique(c(as.character(summary_fusions$geneA), as.character(summary_fusions$geneB)))
} else {
  ref_genes.list[["summary"]]$Fusion <- NULL
}

##### ...copy-number (CN) section: include only genes with CN values at or below cn_bottom, or at or above cn_top
if ( runPurpleChunk ) {

  #### Keep only genes with user-defined CN values
  summary_cn <- ref_dataset.list[[dataset]][["expr_mut_cn_data"]]
  summary_cn_rows <- summary_cn$CN <= cn_bottom | summary_cn$CN >= cn_top
  ref_genes.list[["summary"]]$CN <- as.character(summary_cn[ summary_cn_rows,  ]$Gene)
}

##### ...immune markers section: immune marker genes, preceded by immunogram genes when the immunogram is requested
summary_immune <- ref_genes.list[["genes_immune"]]$immune_markers$SYMBOL

if ( params$immunogram ) {
  summary_immune <- c(ref_genes.list[["genes_immune"]]$immunogram$SYMBOL, summary_immune)
}

ref_genes.list[["summary"]]$Immune <- unique(summary_immune)
##### Build two sunburst plots summarising altered genes per alteration type (report section):
#####   sunburst_plot[[1]] - all altered genes
#####   sunburst_plot[[2]] - only genes listed in more than one alteration type (NA if there are none)
##### plotly is loaded here and detached again once the plots are saved
suppressMessages(library(plotly))

##### Prepare dataframe for Sunburst plot summarising all altered genes
##### (list with one vector of gene names per alteration type)
alt_genes.all.list <- ref_genes.list[["summary"]]

##### Don't show cancer genes list (too long)
alt_genes.all.list$Cancer <- NULL

##### Note all altered genes: table with the number of alteration types each gene is listed in, most frequent first
alt_genes.all <- sort(table(unlist(alt_genes.all.list)), decreasing = TRUE)

##### Replace each vector of gene names with the corresponding subset of the counts table (names = genes, values = counts)
for ( alt in names(alt_genes.all.list) ) {

  ##### Add only alteration type which has at least one alteration detected
  if ( length(alt_genes.all[ names(alt_genes.all) %in% alt_genes.all.list[[ alt ]] ])  > 0 ) {
    alt_genes.all.list[[ alt ]] <- alt_genes.all[ names(alt_genes.all) %in% alt_genes.all.list[[ alt ]] ]
  } else {
    ##### Assigning NULL removes the alteration type from the list
    alt_genes.all.list[[ alt ]] <- NULL
  }
}

##### Top-level (parent) nodes of the sunburst plot: one per alteration type
##### The value of a parent node is its number of genes divided by 100 (presumably to keep it below the sum of its children - TODO confirm)
sunburst.all.df <- data.frame(ids = names(alt_genes.all.list),
  labels = names(alt_genes.all.list),
  parents = rep("", length(alt_genes.all.list)),
  values = as.numeric(lengths(alt_genes.all.list))/100,
  stringsAsFactors = FALSE
)

##### Child nodes: one per gene within each alteration type, with the gene count as value
for ( alt in names(alt_genes.all.list) ) {
  
  ##### Add only alteration type which has at least one alteration detected
  if ( length(alt_genes.all[ names(alt_genes.all) %in% names(alt_genes.all.list[[ alt ]]) ])  > 0 ) {
      
    sunburst.all.df <- rbind( sunburst.all.df , data.frame(ids = paste( alt, names(alt_genes.all.list[[ alt ]]), sep = " - "), 
          labels = paste0("\t\t", names(alt_genes.all.list[[ alt ]]), "\t\t"),
          parents = rep( alt , length(alt_genes.all.list[[ alt ]])),
          values = as.numeric(alt_genes.all.list[[ alt ]])
          ) )
  }
}

##### Sunburst plot for all altered genes
sunburst_plot <- NULL
sunburst_plot[[1]] <- plot_ly(sunburst.all.df, ids = ~ids, labels = ~labels, parents = ~parents, values = ~values, type = 'sunburst', width = 600, height = 600)

##### Now include only genes that appear in more than one list (count > 1)
alt_genes.list <- alt_genes.all.list
alt_genes <- alt_genes.all[ alt_genes.all > 1 ]

for ( alt in names(alt_genes.list) ) {

  ##### Add only alteration type which has at least one alteration detected
  if ( length(alt_genes[ names(alt_genes) %in% names(alt_genes.list[[ alt ]]) ])  > 0 ) {
    alt_genes.list[[ alt ]] <- alt_genes[ names(alt_genes) %in% names(alt_genes.list[[ alt ]]) ]
  } else {
    ##### Assigning NULL removes the alteration type from the list
    alt_genes.list[[ alt ]] <- NULL
  }
}
  
##### Top-level (parent) nodes for the plot limited to genes listed in more than one alteration type
sunburst.df <- data.frame(ids = names(alt_genes.list),
  labels = names(alt_genes.list),
  parents = rep("", length(alt_genes.list)),
  values = as.numeric(lengths(alt_genes.list))/100,
  stringsAsFactors = FALSE
)
  
##### Child nodes: one per gene within each remaining alteration type
for ( alt in names(alt_genes.list) ) {
  sunburst.df <- rbind( sunburst.df , data.frame(ids = paste( alt, names(alt_genes.list[[ alt ]]), sep = " - "), 
        labels = paste0("\t\t", names(alt_genes.list[[ alt ]]), "\t\t"),
        parents = rep( alt , length(alt_genes.list[[ alt ]])),
        values = as.numeric(alt_genes.list[[ alt ]])
        ) )
}

##### Create the second plot only if there is anything to show; NA marks "no plot"
if ( nrow(sunburst.df) > 0 ) {
  sunburst_plot[[2]] <- plot_ly(sunburst.df, ids = ~ids, labels = ~labels, parents = ~parents, values = ~values, type = 'sunburst', width = 600, height = 600)
} else {
  sunburst_plot[[2]] <- NA
}

##### Create directory for the plots
summaryPlotsDir <- paste(results_dir, "summaryPlots", sep = "/")
if ( !file.exists(summaryPlotsDir) ) {
  dir.create(summaryPlotsDir, recursive=TRUE)
}

##### Save interactive plot as html file
saveWidgetFix(sunburst_plot[[1]], file = paste(summaryPlotsDir, "sunburst_plot_all.html", sep = "/"))

##### The second plot is NA when no gene is listed in more than one section.
##### identical() is used because is.na() on a plot object returns one value per element, which is not a valid "if" condition
if ( !identical(sunburst_plot[[2]], NA) ) {
  saveWidgetFix(sunburst_plot[[2]], file = paste(summaryPlotsDir, "sunburst_plot.html", sep = "/"))
}

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload=FALSE)

#### Clear plots to free up some memory
if(!is.null(dev.list())) invisible(dev.off())

##### Clean the space
rm(sunburst.all.df, sunburst.df)
##### Prepare dataframe for a table summarising all altered genes
##### One row per altered gene with: a "Yes" / "-" flag per alteration type, links to external resources and
##### the number of alteration types (report sections) in which the gene is listed
##### Create list with alterations detected in each gene
genes.list <- names(alt_genes.all)
summary_table.list <- vector("list", length(genes.list))
names(summary_table.list) <- genes.list

##### Go through all altered genes and note the alteration types
for ( gene in names(alt_genes.all) ) {

  ##### "Yes" if the gene is listed for given alteration type, "-" otherwise (same order as names(alt_genes.all.list))
  gene_alt_flags <- vapply(alt_genes.all.list, function(alt_counts) gene %in% names(alt_counts), logical(1))
  gene_alt_flags <- unname(ifelse(gene_alt_flags, "Yes", "-"))

  ##### Add links to external resources (collected in a single comma-separated string)
  ##### Provide link to VICC meta-knowledgebase ( https://search.cancervariants.org )
  gene_resources <- paste0("<a href='https://search.cancervariants.org/#", gene, "' target='_blank'>VICC</a>")

  ##### Provide link to OncoKB (isTRUE() guards against a missing OncoKB value for the gene)
  if ( gene %in% rownames(ref_genes.list[["genes_oncokb"]]) ) {
    if ( isTRUE(ref_genes.list[["genes_oncokb"]][  gene, "OncoKB"] == "Yes") ) {
      gene_resources <- paste( gene_resources, paste0("<a href='http://oncokb.org/#/gene/", gene, "' target='_blank'>OncoKB</a>"), sep = ", ")
    }
  }

  ##### Provide link to CIViC database druggable genes ( https://civicdb.org )
  ##### Only the first URL is used in case more than one is listed for the gene, so that a single string is produced
  if ( gene %in% caner_genes_annot.list[["civic_clin_evid"]]$gene ) {
    gene_civic_url <- unique(caner_genes_annot.list[["civic_clin_evid"]][ caner_genes_annot.list[["civic_clin_evid"]]$gene %in% gene , "gene_civic_url"])[1]
    gene_resources <- paste( gene_resources, paste0("<a href='", gene_civic_url, "' target='_blank'>CIViC</a>"), sep = ", ")
  }

  summary_table.list[[ gene ]] <- c( gene_alt_flags, gene_resources )
}

##### Convert the list into data frame (one row per gene)
summary_table.df <- data.frame(matrix(unlist(summary_table.list), nrow=length(summary_table.list), byrow=TRUE),stringsAsFactors=FALSE)

##### Add gene names and number of sections in which individual genes are reported
summary_table.df <- cbind(names(summary_table.list), summary_table.df)
summary_table.df <- cbind(summary_table.df, as.numeric(alt_genes.all))
colnames(summary_table.df) <- c("Gene", names(alt_genes.all.list), "Resources", "Count")

##### Add GeneCards links
summary_table.df$Gene <- paste0("<a href='https://www.genecards.org/cgi-bin/carddisp.pl?gene=", summary_table.df$Gene, "' target='_blank'>", summary_table.df$Gene, "</a>")

##### Clean the space and return output
rm(summary_table.list)

Input data summary

Reference patient cohorts

The following reference patient cohorts were used for the analysis:

Input genes

Out of the 27021 input genes 16264 are used for analyses:

  • 16257 have reliably detected expression
  • 7 are not expressed but are of interest and are included in analyses
  • 10757 are either not expressed or their expression level is too low to be detected
  • 0 genes were ignored due to lack of HGNC-approved gene symbol

NOTE, the 10764 genes with no/low expression are indicated in BLANK cells with missing values in expression summary tables in Mutated genes, Structural variants, CN altered genes, Immune markers, HRD genes and Cancer genes sections.

Library size

Bar-plot illustrating library size for each sample.

library_size

Data filtering and transformation

The read count data were converted into CPMs using edgeR functions. Genes with low counts were filtered out. The data were log2-transformed.

The CPM of 1 (cut-off for removing low expressed genes) corresponds to 16 reads in sample with the lowest sequencing depth, and 63 reads in sample with the greatest sequencing depth. The plot below presents the relation between read counts and the corresponding CPM values in the patient data. The red vertical line indicates the threshold for filtering genes with low counts.

counts_vs_transformed

Plot(s) below present CPM data distribution before and after filtering genes with low counts.

data_transformation_nonfiltered

if ( params$filter ) {
  data_transformation_filtered
}

Data normalisation

During the sample preparation or sequencing process, external factors that are not of biological interest can affect the expression of individual samples. It is assumed that all samples should have a similar range and distribution of expression values. Normalisation for sample-specific effects is required to ensure that the expression distributions of each sample are similar across the entire experiment. Normalisation is performed using TMM method.

Box-plots below present CPM data for individual samples, coloured by sample groups, before and after TMM normalisation.

data_nonnormalised

if ( params$norm != "none" ) {
  data_normalised
}

Exploratory data analysis

The expression data produced by different studies are confounded by non-biological experimental variances that prevent direct comparison of samples from different studies. In order to minimise the variance caused by confounding factors limma removeBatchEffect method was used to adjust expression measurements for potential batch effects. In brief, the strategy is to consider the investigated sample and the 40 PAAD (UMCCR) samples as one batch (regardless of the investigated sample tissue origin) and 20 TEST (TCGA) samples (of any cancer type) as another batch. The objective is to remove as much data variation due to technical factors as possible.

Principal component analysis (PCA) was performed to reduce the dimensionality of data to visually assess similarities and differences between samples. This exploratory analysis facilitates identification of the key factors affecting the variability in the expression data.

  • PCA plot

Scatter-plots of the first 2 principal components (PCs) constituting the primary source of variation in the data before and after batch effects correction.

ref_dataset.list[[dataset]][["pca_combined_data_processed"]][[2]]
#### Clear plots to free up some memory
if(!is.null(dev.list())) invisible(dev.off())
ref_dataset.list[[dataset]][["pca_batch_effect_corrected"]][[2]]
#### Clear plots to free up some memory
if(!is.null(dev.list())) invisible(dev.off())
  • Scree-plot

Scree-plots presenting the fraction of total variance (y-axis) attributed to each PC (x-axis) before and after batch effects correction. The PCs are ordered by decreasing order of contribution to total variance.

ref_dataset.list[[dataset]][["pca_combined_data_processed"]][[3]]
#### Clear plots to free up some memory
if(!is.null(dev.list())) invisible(dev.off())
ref_dataset.list[[dataset]][["pca_batch_effect_corrected"]][[3]]
#### Clear plots to free up some memory
if(!is.null(dev.list())) invisible(dev.off())
  • RLE plot

The relative log expression (RLE) plot is a useful diagnostic tool to visualise the differences between the distributions of read counts across samples. It shows boxplot of the log-ratios of the gene-level read counts (y-axis) of each sample to those of a reference sample (defined as the median across the samples). Ideally, the distributions should be centered around the zero line and as tight as possible. Clear deviations indicate the need for normalisation and/or the presence of outlying samples.

ref_dataset.list[[dataset]][["rle_combined_data_processed"]]

if ( params$norm != "none" ) {
  ref_dataset.list[[dataset]][["rle_batch_effect_corrected"]]
}

##### Present the treatment timeline plot
treatment_timeline

Findings summary

Per-alteration plot

All altered genes are summarised in the plot below. The number next to each gene indicates the number of times it appears across the following report sections: Fusion genes (supported by genomic data or reported in FusionGDB), Immune markers or HRD genes. That number is also reflected by the width of corresponding branches. Click on the category of interest to expand corresponding branches. Genes within each category are ordered by the number of report sections in which they appear and then alphabetically.

NOTE, no genes are listed in more than one section.

##### Update MySQL command to populate RNA-seq data portal
mysql_populate <- paste0(mysql_populate, "Findings summary")
mysql_populate_update <- paste0(mysql_populate_update, "Findings summary")

##### Present per-alteration findings summary sunburst plot: the plot limited to genes listed in at least two
##### report sections if it exists, otherwise the plot for all altered genes.
##### The second plot is NA when there is nothing to show; identical() is used because is.na() on a plot object
##### returns one value per element, which is not a valid "if" condition
if ( !identical(sunburst_plot[[2]], NA) ) {
  sunburst_plot[[2]]
} else {
  sunburst_plot[[1]]
}
#### Clear plots to free up some memory
if(!is.null(dev.list())) invisible(dev.off())
##### Present the sunburst plot for all altered genes, unless it has already been presented above
if ( !identical(sunburst_plot[[2]], NA) ) {
  sunburst_plot[[1]]
}

#### Clear plots to free up some memory
if(!is.null(dev.list())) invisible(dev.off())
rm(counts_vs_transformed, data_transformation_nonfiltered, data_transformation_filtered, data_nonnormalised, data_normalised, treatment_timeline, data.annot)

Per-gene table

Table summarising all altered genes listed across following report sections: Fusion genes (supported by genomic data or reported in FusionGDB), Immune markers or HRD genes. The Resources column contains links to databases that may provide additional source of evidence for the altered genes’ clinical significance. Genes ordered by the number of report sections they appear in (Count column) and then alphabetically.

NOTE, no genes are listed in more than one section.

##### Present per-gene findings summary table
##### Columns holding the "Yes" / "-" flags: everything between the first column (Gene) and the last two (Resources, Count).
##### NOTE: "2:ncol(x)-2" is parsed as "(2:ncol(x)) - 2", so the positions are computed explicitly here
alteration_cols <- colnames(summary_table.df)[ seq_len(ncol(summary_table.df) - 3) + 1 ]

findings.summary <- DT::datatable(
    data = summary_table.df,
    filter = "none",
    rownames = FALSE,
    extensions = c('Buttons','Scroller'),
    options = list(pageLength = 10, dom = 'Bfrltip', buttons = c('excel', 'csv', 'pdf','copy','colvis'), scrollX = TRUE, deferRender = TRUE, scrollY = "333px", scroller = TRUE),
    width = 800,
    height = 455,
    caption = htmltools::tags$caption( style = 'caption-side: top; text-align: left; color:grey; font-size:100%'),
    escape = FALSE
  ) %>%
  DT::formatStyle( columns = colnames(summary_table.df), `font-size` = '12px', 'text-align' = 'center' )

##### Highlight cells of alteration types in which given gene is listed ("Yes" = white text on black background)
if ( length(alteration_cols) > 0 ) {
  findings.summary <- findings.summary %>%
    DT::formatStyle(columns = alteration_cols,
                    backgroundColor = DT::styleEqual(c("-", "Yes"), c("transparent", "black")), color = DT::styleEqual(c("-", "Yes"), c("black", "white")))
}

findings.summary
##### Create directory for tables
summaryTableDir <- paste(results_dir, "summaryTables", sep = "/")
if ( !file.exists(summaryTableDir) ) {
    dir.create(summaryTableDir, recursive=TRUE)
}

saveWidgetFix(widget=findings.summary, file=paste(summaryTableDir, "findings.summary.html", sep = "/"), selfcontained=TRUE)

##### Clean the space
rm(summary_table.df, findings.summary, sunburst_plot, alteration_cols)

Mutated genes

mRNA expression levels of genes containing single nucleotide variants (SNVs) or insertions/deletions (indels), obtained from the PCGR report, in patient’s sample and their average mRNA expression in samples from cancer cohorts. NOTE, only PCGR tier 1-4 and non-coding splice region variants are reported.

Mutation data for this sample is NOT AVAILABLE.

- Summary table

Out of the 0 mutated genes 0 include tier 1-4 variants and 0 non-coding splice region variant. Of these, the expression of 0 was reliably measured in patient’s sample. The remaining 0 genes are either not expressed or their expression level is too low to be detected (indicated in BLANK cells with missing values).

Percentiles

##### Update MySQL command to populate RNA-seq data portal
mysql_populate <- paste0(mysql_populate, ",Mutated genes")
mysql_populate_update <- paste0(mysql_populate_update, ",Mutated genes")

##### Generate expression summary table (percentiles) for mutated genes (based on PCGR report)
targets <- ref_dataset.list[[dataset]][["sample_annot"]]
data <- ref_dataset.list[[dataset]][["data_to_report"]]

##### Consider only genes with mutations classified within user-defined tiers
genes <- ref_genes.list[["summary"]]$Mutated

##### Work out how many genes are to be presented: all of them by default...
limit_genes <- FALSE
genes_no <- length(genes)

##### ...none if there are no mutated genes, or the top params$top_genes if there are more than that
if ( length(genes) == 0 ) {
  genes <- NULL
  genes_no <- 0
} else if ( length(genes) > params$top_genes ) {
  limit_genes <- TRUE
  genes_no <- params$top_genes
}

mut_genes.expr.perc <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
  mut_annot = ref_genes.list[["pcgr"]][, c("SYMBOL", "TIER", "CONSEQUENCE", "VARIANT_CLASS", "AF_TUMOR", "GENOMIC_CHANGE", "PROTEIN_CHANGE")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline") ],
  ext_links = TRUE,
  type = "perc",
  scaling = scaling
)

##### Present the expression summary table (first element of the exprTable output)
mut_genes.expr.perc[[1]]

##### Save the expression table as html file, if requested
if ( params$save_tables ) {
  saveWidgetFix(widget=mut_genes.expr.perc[[1]], file=paste(exprTableDir, "mut_genes.expr.perc.html", sep = "/"), selfcontained=TRUE)
}
Table legend

The RED colour range indicates relatively high expression (percentile) values and the BLUE colour range indicates relatively low expression (percentile) values in individual sample groups. The BLANK cells with missing values indicate genes with no/low expression. The Diff (Patient vs TEST (TCGA)) column illustrates the difference between percentiles in the patient sample and the reference cancer cohort for each mutated gene. Variants’ tier, consequence, class and tumour allele frequency (AF), as well as genomic and protein change, are also provided based on information from the PCGR report. In case of multiple variants detected in a single gene, the variant with the lowest tier is reported and other potential consequences are listed in column CONSEQUENCE_OTHER. Genes are ordered by increasing variants TIER and then by decreasing absolute values in the Diff (Patient vs TEST (TCGA)) column.


Z-scores

##### Build the Z-score-based expression summary for the mutated genes and keep only
##### the first element of the exprTable output (the table widget)
mut_genes.expr.z <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
  mut_annot = ref_genes.list[["pcgr"]][, c("SYMBOL", "TIER", "CONSEQUENCE", "VARIANT_CLASS", "AF_TUMOR", "GENOMIC_CHANGE", "PROTEIN_CHANGE")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline") ],
  ext_links = TRUE,
  type = "z",
  scaling = scaling
)[[1]]

##### Show the table in the report
mut_genes.expr.z

##### Optionally export the table as a self-contained html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = mut_genes.expr.z,
    file = paste(exprTableDir, "mut_genes.expr.z.html", sep = "/"),
    selfcontained = TRUE
  )
}

##### The widget is not needed beyond this point
rm(list = "mut_genes.expr.z")
Table legend

The RED colour range indicates relatively high expression (Z-score) values and the BLUE colour range indicates relatively low expression (Z-score) values in individual sample group. The BLANK cells with missing values indicate genes with no/low expression. The Diff (Patient vs TEST (TCGA)) column illustrates the difference between Z-scores in patient sample and reference cancer cohort for each mutated gene. Variants’ tier, consequence, class and tumour allele frequency (AF), as well as genomic and protein change are also provided based on information from PCGR report. In case of multiple variants detected in a single gene the variant with the lowest tier is reported and other potential consequences are listed in column CONSEQUENCE_OTHER. Genes are ordered by increasing variants TIER and then by decreasing absolute values in the Diff (Patient vs TEST (TCGA)) column.


- Expression profiles


Fusion genes

Fusion genes prioritisation

Fusion genes detected in transcriptome data are prioritised in the following order:

  1. Involvement of fusion gene(s) detected in genomic data (if Structural variants results are available)

  2. Detected in transcriptome data by Arriba tool

  3. Reported fusion event according to FusionGDB database

  4. Decreasing number of split reads

  5. Decreasing number of pair reads

  6. Involvement of cancer gene(s) (see Cancer genes section)

Fusion genes filtering

Fusion genes detected in transcriptome data are reported if at least one of the following criteria is met:

  1. Involvement of fusion gene(s) detected in genomic data (if Structural variants results are available)

  2. Reported fusion event according to FusionGDB database

  3. Involvement of cancer gene(s) (see Cancer genes section)

  4. Split reads > 1

  5. Pair reads > 1 and split reads > 1

- Summary

Out of the 2 fusion event(s) 0 involve DNA-supported fusion genes (see Structural variants section), 0 are reported in FusionGDB and 0 involve Cancer genes.


##### Create a nice table output (with dataTable)
##### Builds "fusions.table" (columns selected and labelled according to the fusion callers that were run)
##### and the "fusions.summary" widget; both objects are consumed by the chunk that saves the table
if ( runFusionChunk ) {

  ##### Update MySQL command to populate RNA-seq data portal
  mysql_populate <- paste0(mysql_populate, ",Fusion genes")
  mysql_populate_update <- paste0(mysql_populate_update, ",Fusion genes")

  fusions.table <- fusions
  fusions.table$geneA <- as.vector(fusions.table$geneA)
  fusions.table$geneB <- as.vector(fusions.table$geneB)

  ##### Provide link to FusionGDB for reported fusions. The rows are selected with which(), so a table
  ##### with no rows or with missing "reported_fusion" values is handled without an index loop
  fusions.table.reported_rows <- which(fusions.table$reported_fusion == "Yes")

  if ( length(fusions.table.reported_rows) > 0 ) {
    fusions.table.href <- paste0("<a href='https://ccsm.uth.edu/FusionGDB/gene_search_result.cgi?page=page&type=quick_search&quick_search=", fusions.table$FGID[fusions.table.reported_rows], "' target='_blank'>")
    fusions.table$geneA[fusions.table.reported_rows] <- paste0(fusions.table.href, fusions.table$geneA[fusions.table.reported_rows], "</a>")
    fusions.table$geneB[fusions.table.reported_rows] <- paste0(fusions.table.href, fusions.table$geneB[fusions.table.reported_rows], "</a>")
    rm(fusions.table.href)
  }

  rm(fusions.table.reported_rows)

  ##### Columns to report (vector names) and their display labels (vector values). Keeping both in one
  ##### vector guarantees that the selection and the labels stay in sync. "FGID" is in none of the sets
  ##### Dragen + Arriba
  if ( runDragenFusionChunk && runArribaChunk ) {
    fusions.table.cols <- c(
      "geneA" = "Gene A",
      "geneB" = "Gene B",
      "split_reads" = "Split reads (Total)",
      "split_readsA" = "Split reads (A)",
      "split_readsB" = "Split reads (B)",
      "discordant_mates" = "Pair reads",
      "geneA_dna_support" = "DNA support (A)",
      "geneB_dna_support" = "DNA support (B)",
      "reported_fusion" = "Reported fusion",
      "fusions_cancer" = "Cancer gene(s)",
      "reported_fusion_geneA" = "Fusion gene (A)",
      "reported_fusion_geneB" = "Fusion gene (B)",
      "confidence" = "Confidence (Arriba)",
      "score" = "Score (Dragen)",
      "breakpointA" = "Breakpoint (A)",
      "breakpointB" = "Breakpoint (B)",
      "siteA" = "Site (A)",
      "siteB" = "Site (B)",
      "type" = "Type",
      "circos" = "Genomic view"
    )

  ##### Arriba / Arriba + Pizzly
  } else if ( runArribaChunk ) {
    fusions.table.cols <- c(
      "geneA" = "Gene A",
      "geneB" = "Gene B",
      "split_reads" = "Split reads (Total)",
      "split_readsA" = "Split reads (A)",
      "split_readsB" = "Split reads (B)",
      "discordant_mates" = "Pair reads",
      "geneA_dna_support" = "DNA support (A)",
      "geneB_dna_support" = "DNA support (B)",
      "reported_fusion" = "Reported fusion",
      "fusions_cancer" = "Cancer gene(s)",
      "reported_fusion_geneA" = "Fusion gene (A)",
      "reported_fusion_geneB" = "Fusion gene (B)",
      "confidence" = "Confidence (Arriba)",
      "breakpointA" = "Breakpoint (A)",
      "breakpointB" = "Breakpoint (B)",
      "siteA" = "Site (A)",
      "siteB" = "Site (B)",
      "type" = "Type",
      "circos" = "Genomic view"
    )

  ##### Dragen only
  } else if ( runDragenFusionChunk ) {

    #####  Dragen's fusion format version 3.9.3
    if ( all(c("GeneALocation", "GeneBLocation", "NumSplitReads","NumSoftClippedReads", "Score") %in% colnames(dragen.fusions)) ) {
      fusions.table.cols <- c(
        "geneA" = "Gene A",
        "geneB" = "Gene B",
        "split_reads" = "Split reads",
        "geneA_dna_support" = "DNA support (A)",
        "geneB_dna_support" = "DNA support (B)",
        "reported_fusion" = "Reported fusion",
        "fusions_cancer" = "Cancer gene(s)",
        "reported_fusion_geneA" = "Fusion gene (A)",
        "reported_fusion_geneB" = "Fusion gene (B)",
        "score" = "Score",
        "breakpointA" = "Breakpoint (A)",
        "breakpointB" = "Breakpoint (B)",
        "siteA" = "Site (A)",
        "siteB" = "Site (B)",
        "circos" = "Genomic view"
      )

    #####  Dragen's fusion format prior to version 3.9.3
    } else {
      fusions.table.cols <- c(
        "geneA" = "Gene A",
        "geneB" = "Gene B",
        "geneA_dna_support" = "DNA support (A)",
        "geneB_dna_support" = "DNA support (B)",
        "reported_fusion" = "Reported fusion",
        "fusions_cancer" = "Cancer gene(s)",
        "reported_fusion_geneA" = "Fusion gene (A)",
        "reported_fusion_geneB" = "Fusion gene (B)",
        "score" = "Score",
        "breakpointA" = "Breakpoint (A)",
        "breakpointB" = "Breakpoint (B)",
        "circos" = "Genomic view"
      )
    }

  ##### Pizzly only
  } else {
    fusions.table.cols <- c(
      "geneA" = "Gene A",
      "geneB" = "Gene B",
      "split_reads" = "Split reads",
      "discordant_mates" = "Pair reads",
      "geneA_dna_support" = "DNA support (A)",
      "geneB_dna_support" = "DNA support (B)",
      "reported_fusion" = "Reported fusion",
      "fusions_cancer" = "Cancer gene(s)",
      "reported_fusion_geneA" = "Fusion gene (A)",
      "reported_fusion_geneB" = "Fusion gene (B)",
      "circos" = "Genomic view"
    )
  }

  ##### Keep the selected columns (in the listed order) and apply the display labels
  fusions.table <- fusions.table[ , names(fusions.table.cols) ]
  names(fusions.table) <- unname(fusions.table.cols)
  rm(fusions.table.cols)

} else {

  ##### Create empty table. The number of columns is derived from the labels, since names<- pads a
  ##### names vector that is shorter than the number of columns with NA
  fusions.table.cols <- c("Gene A", "Gene B", "Split reads", "Pair reads", "DNA support (A)", "DNA support (B)", "Reported fusion", "Cancer gene(s)", "Fusion gene (A)", "Fusion gene (B)", "Breakpoint (A)", "Breakpoint (B)", "Genomic view")

  fusions.table <- data.frame(matrix(ncol = length(fusions.table.cols), nrow = 0))
  names(fusions.table) <- fusions.table.cols
  rm(fusions.table.cols)
}

##### Present gene fusion events in a table
fusions.summary <- DT::datatable( data = fusions.table, filter = "none", rownames = FALSE, extensions = c('Buttons','Scroller'), options = list(pageLength = 10, dom = 'Bfrtip', buttons = c('excel', 'csv', 'pdf','copy','colvis'), scrollX = TRUE, deferRender = TRUE, scrollY = "333px", scroller = TRUE), width = 800, height = 490, caption = htmltools::tags$caption(style = 'caption-side: top; text-align: left; color:grey; font-size:100% ;'), escape = FALSE) %>%
  DT::formatStyle( columns = names(fusions.table), `font-size` = '12px', 'text-align' = 'center' )

##### Highlight cells with fusions involving cancer genes (grey), DNA support (from MANTA, coral)
##### or reported fusions (green). Applied only when fusion results are available
if ( runFusionChunk ) {
  fusions.summary <- fusions.summary %>%
    DT::formatStyle( columns = colnames(fusions.table) %in% "Cancer gene(s)", backgroundColor = DT::styleEqual(c("-", "Yes"), c('transparent', 'lightgrey')) ) %>%
    DT::formatStyle( columns = colnames(fusions.table) %in% "DNA support (A)", backgroundColor = DT::styleEqual(c("-", "Yes"), c('transparent', 'coral')) ) %>%
    DT::formatStyle( columns = colnames(fusions.table) %in% "DNA support (B)", backgroundColor = DT::styleEqual(c("-", "Yes"), c('transparent', 'coral')) ) %>%
    DT::formatStyle( columns = colnames(fusions.table) %in% "Reported fusion", backgroundColor = DT::styleEqual( c("-", "Yes"), c('transparent', 'lightgreen')) )
}

fusions.summary
##### Optionally export the fusion summary table as a self-contained html file
if ( params$save_tables ) {

  ##### Directory for fusion tables ("fusionsTableDir" is reused when saving the genomic view table)
  fusionsTableDir <- paste(results_dir, "fusionsTables", sep = "/")

  if ( !file.exists(fusionsTableDir) ) {
    dir.create(fusionsTableDir, recursive = TRUE)
  }

  saveWidgetFix(
    widget = fusions.summary,
    file = paste(fusionsTableDir, "fusions.summary.html", sep = "/"),
    selfcontained = TRUE
  )
}

##### The table and its widget are not needed beyond this point
rm(list = c("fusions.table", "fusions.summary"))
Table legend

Cells in RED indicate DNA-supported fusion genes (see Structural variants section), cells in GREEN indicate fusion events reported in FusionGDB, and cells in GREY indicate fusions containing Cancer genes. Gene fusions reported in FusionGDB are hyperlinked. Genes known to be involved in gene fusions are flagged based on information provided in FusionGDB and Cancer Genome Interpreter (CGI) databases. Breakpoint (A/B) - genomic coordinates of the breakpoints in gene A/B; Site (A/B) - location of the breakpoints in gene A/B; Type - type of event based on the orientation of the supporting reads and the coordinates of breakpoints

Fusion events are ordered by the following columns:

DNA support (A/B): DNA-supported fusion gene(s) (see Structural variants section)

Confidence level from Arriba tool

Reported fusion: fusion event reported in FusionGDB

Split count: the number of supporting split reads

Pair count: the number of supporting pair reads

Cancer gene(s): gene fusion events involving Cancer genes

Fusion gene (A/B): gene(s) known to be involved in tumorigenesis across cancer types based on FusionGDB and CGI databases


- Genomic view

0 DNA-supported fusion genes (see Structural variants section) and 0 fusion events reported in FusionGDB are presented in the genomic context. Red colour is used for links between positions of same chromosomes and blue for links between different chromosomes. The table at the bottom contains genomic coordinates of individual fusion genes sorted based on their genomic location.

NOTE: 0 of such fusions do not have genomic information available and are not presented on the circos plot (see Genomic view column in the - Summary table).

Genomic data for this sample is NOT AVAILABLE.

##### Keep only fusions reported in FusionGDB and, when structural variants results are available,
##### also those with DNA support for gene A or gene B
if ( runSVsChunk ) {
  fusion_annot_top <- fusion_annot[ fusion_annot$reported_fusion == "Yes" | fusion_annot$geneA_dna_support == "Yes" | fusion_annot$geneB_dna_support == "Yes" , ]
} else {
  fusion_annot_top <- fusion_annot[ fusion_annot$reported_fusion == "Yes" , ]
}

if ( nrow(fusion_annot_top) > 0 ) {
  
  ##### Create folder for fusion plots
  fusionsPlotDir <- paste(results_dir, "fusionsPlot", sep = "/")
    
  if ( !file.exists(fusionsPlotDir) ) {
    dir.create(fusionsPlotDir, recursive=TRUE)
  }
  
  ##### Prepare object for RCircos. The ideogram dataset name depends on the genome assembly parameter,
  ##### so it is loaded and fetched by name rather than by evaluating pasted code
  cyto.info.name <- paste0("UCSC.HG", params$ucsc_genome_assembly, ".Human.CytoBandIdeogram")
  data(list = cyto.info.name)
  cyto.info <- get(cyto.info.name)
  rm(cyto.info.name)
    
  ##### Keep only fusions with gene A located in chromosomes present in the ideogram
  ##### NOTE(review): only SEQNAME (gene A) is checked here, not SEQNAME.1 (gene B) - confirm this is intended
  fusion_annot_top <- fusion_annot_top[ paste0("chr", fusion_annot_top$SEQNAME) %in% cyto.info$Chromosome,  ]
  
  fusion_annot_top.circos.pairs <- fusion_annot_top[, c("SEQNAME", "GENESEQSTART", "GENESEQEND", "SYMBOL","SEQNAME.1", "GENESEQSTART.1", "GENESEQEND.1", "SYMBOL.1")]
  
  ##### Add "chr" to chromosome numbers
  fusion_annot_top.circos.pairs$SEQNAME <- paste0("chr", fusion_annot_top.circos.pairs$SEQNAME)
  fusion_annot_top.circos.pairs$SEQNAME.1 <- paste0("chr", fusion_annot_top.circos.pairs$SEQNAME.1)
  
  ##### Change column names. Gene A (columns 1-4) and gene B (columns 5-8) deliberately get the same
  ##### four names, so that the two halves can be stacked with rbind below
  names(fusion_annot_top.circos.pairs) <- rep(c("Chromosome", "chromStart", "chromEnd", "Gene"), times = 2)
  
  ##### Remove entries with missing genomic coordinates
  fusion_annot_top.circos.pairs <- fusion_annot_top.circos.pairs[complete.cases(fusion_annot_top.circos.pairs), ]
  
  ##### One row per gene (gene A entries followed by gene B entries) for the gene connector and name tracks
  fusion_annot_top.circos <- rbind(fusion_annot_top.circos.pairs[, 1:4 ], fusion_annot_top.circos.pairs[, 5:8 ])
  
  ##### Drop both gene name columns, leaving the six coordinate columns used as link data
  fusion_annot_top.circos.pairs <- fusion_annot_top.circos.pairs[, names(fusion_annot_top.circos.pairs) != "Gene" ]
  
  ##### Generate circos plot
  RCircos.Set.Core.Components( cyto.info=cyto.info, chr.exclude=NULL, tracks.inside=4, tracks.outside=0 )
  RCircos.Set.Plot.Area()  
  RCircos.Chromosome.Ideogram.Plot()
  RCircos.Gene.Connector.Plot(genomic.data = fusion_annot_top.circos, track.num = 1, side="in") 
  RCircos.Gene.Name.Plot(gene.data = fusion_annot_top.circos, name.col = 4, track.num = 2, side = "in")
  RCircos.Link.Plot(link.data = fusion_annot_top.circos.pairs, track.num=4, by.chromosome=TRUE, is.sorted=FALSE, lineWidth=rep(2, nrow(fusion_annot_top.circos.pairs)))
}
##### Write the circos plot representing gene fusion events to a png file.
##### NOTE. Only fusions involving fusion genes supported by MANTA or reported fusions are presented
if ( nrow(fusion_annot_top) == 0 ) {

  ##### Nothing to plot - report it in the document instead
  cat("None of the transcriptome-based fusion events have supporting evidence from DNA data or was previously reported.")

} else {

  ##### Open the png device in the fusion plots directory
  png(
    filename = paste(fusionsPlotDir, "circosPlot.png", sep="/"),
    width = 800,
    height = 800,
    units = "px",
    pointsize = 24
  )

  ##### Redraw the circos plot on the png device
  RCircos.Set.Core.Components( cyto.info = cyto.info, chr.exclude = NULL, tracks.inside = 4, tracks.outside = 0 )
  RCircos.Set.Plot.Area()
  RCircos.Chromosome.Ideogram.Plot()
  RCircos.Gene.Connector.Plot( genomic.data = fusion_annot_top.circos, track.num = 1, side = "in" )
  RCircos.Gene.Name.Plot( gene.data = fusion_annot_top.circos, name.col = 4, track.num = 2, side = "in" )
  RCircos.Link.Plot(
    link.data = fusion_annot_top.circos.pairs,
    track.num = 4,
    by.chromosome = TRUE,
    is.sorted = FALSE,
    lineWidth = rep(2, nrow(fusion_annot_top.circos.pairs))
  )
  invisible(dev.off())

  ##### The plotting inputs are not needed beyond this point
  rm(list = c("fusion_annot_top.circos", "fusion_annot_top.circos.pairs"))

  ##### Close any graphics device that is still open to free up some memory
  if ( !is.null(dev.list()) ) {
    invisible(dev.off())
  }
}
None of the transcriptome-based fusion events have supporting evidence from DNA data or was previously reported.
##### Table with genomic coordinates of the fusions presented on the circos plot.
##### Creates the "fusions.genomicView" widget, which is saved in the following chunk
if ( nrow(fusion_annot_top) > 0 ) {
  
  ##### Clean the table for better presentation
  ##### Columns to present (vector names) and their display labels (vector values). The gene location
  ##### columns (head) and the evidence columns (tail) are shared; only the middle part depends on the callers
  genomicView.cols.head <- c(
    "SYMBOL" = "Gene A",
    "SEQNAME" = "Chrom (A)",
    "GENESEQSTART" = "Start (A)",
    "GENESEQEND" = "End (A)",
    "SYMBOL.1" = "Gene B",
    "SEQNAME.1" = "Chrom (B)",
    "GENESEQSTART.1" = "Start (B)",
    "GENESEQEND.1" = "End (B)"
  )
  
  genomicView.cols.tail <- c(
    "geneA_dna_support" = "DNA support (A)",
    "geneB_dna_support" = "DNA support (B)",
    "reported_fusion" = "Reported fusion",
    "fusions_cancer" = "Cancer gene(s)"
  )
  
  ##### Dragen + Arriba
  ##### NOTE(review): when Arriba was run without Dragen the "Pizzly only" column set below is used,
  ##### unlike in the fusion summary table chunk, which has a dedicated Arriba branch - confirm this is intended
  if ( runDragenFusionChunk && runArribaChunk ) {
    genomicView.cols.mid <- c(
      "breakpointA" = "Breakpoint (A)",
      "breakpointB" = "Breakpoint (B)",
      "split_reads" = "Split reads (Total)",
      "split_readsA" = "Split reads (A)",
      "split_readsB" = "Split reads (B)",
      "discordant_mates" = "Pair reads"
    )
    
  ##### Dragen only
  } else if ( runDragenFusionChunk ) {
    
    #####  Dragen's fusion format version 3.9.3
    if ( all(c("GeneALocation", "GeneBLocation", "NumSplitReads","NumSoftClippedReads", "Score") %in% colnames(dragen.fusions)) ) {
      genomicView.cols.mid <- c(
        "breakpointA" = "Breakpoint (A)",
        "breakpointB" = "Breakpoint (B)",
        "split_reads" = "Split reads"
      )
      
    #####  Dragen's fusion format prior to version 3.9.3
    } else {
      genomicView.cols.mid <- c(
        "breakpointA" = "Breakpoint (A)",
        "breakpointB" = "Breakpoint (B)"
      )
    }
    
  ##### Pizzly only
  } else {
    genomicView.cols.mid <- c(
      "split_reads" = "Split reads",
      "discordant_mates" = "Pair reads"
    )
  }
  
  genomicView.cols <- c(genomicView.cols.head, genomicView.cols.mid, genomicView.cols.tail)
  fusion_annot_top.clean <- fusion_annot_top[, names(genomicView.cols) ]
  
  ##### Order fusions based on the genomic location (chrom and start positions)
  ##### Chromosome names that are not listed in chrOrder become NA in the ordered factors
  chrOrder <-c((1:22),"X","Y","M")
  
  fusion_annot_top.clean$SEQNAME <- factor(fusion_annot_top.clean$SEQNAME, chrOrder, ordered=TRUE)
  fusion_annot_top.clean$SEQNAME.1 <- factor(fusion_annot_top.clean$SEQNAME.1, chrOrder, ordered=TRUE)
  fusion_annot_top.clean <- fusion_annot_top.clean[do.call(order, fusion_annot_top.clean[, c("SEQNAME", "SEQNAME.1", "GENESEQSTART", "GENESEQSTART.1")]), ]
  
  ##### Apply the display labels (done after ordering, which refers to the original column names)
  names(fusion_annot_top.clean) <- unname(genomicView.cols)
  rm(genomicView.cols, genomicView.cols.head, genomicView.cols.mid, genomicView.cols.tail)

  fusions.genomicView <- DT::datatable( data = fusion_annot_top.clean, filter="none", rownames = FALSE, extensions = c('Buttons','Scroller'), options = list(pageLength = 10, dom = 'Bfrtip', buttons = c('excel', 'csv', 'pdf','copy','colvis'), scrollX = TRUE, deferRender = TRUE, scrollY = "167px", scroller = TRUE), width = 800, height = 318,  escape = FALSE) %>%
      DT::formatStyle( columns = names(fusion_annot_top.clean), `font-size` = '12px', 'text-align' = 'center' ) %>%
    
      ##### Highlight cells with fusions involving cancer genes (grey), DNA support (coral) or reported fusions (green)
      DT::formatStyle( columns = colnames(fusion_annot_top.clean) %in% "Cancer gene(s)", backgroundColor = DT::styleEqual(c("-", "Yes"), c('transparent', 'lightgrey')) ) %>%
      DT::formatStyle( columns = colnames(fusion_annot_top.clean) %in% "DNA support (A)", backgroundColor = DT::styleEqual(c("-", "Yes"), c('transparent', 'coral')) ) %>%
      DT::formatStyle( columns = colnames(fusion_annot_top.clean) %in% "DNA support (B)", backgroundColor = DT::styleEqual(c("-", "Yes"), c('transparent', 'coral')) ) %>%
      DT::formatStyle( columns = colnames(fusion_annot_top.clean) %in% "Reported fusion", backgroundColor = DT::styleEqual( c("-", "Yes"), c('transparent', 'lightgreen')) )

  fusions.genomicView
}

##### Clean the space. The table exists only when there were fusions to present, and rm() warns
##### about objects that do not exist
if ( exists("fusion_annot_top.clean", inherits = FALSE) ) {
  rm(fusion_annot_top.clean)
}
Table legend

Cells in RED indicate DNA-supported fusion genes (see Structural variants section), cells in GREEN indicate gene fusions reported in FusionGDB, and cells highlighted in GREY indicate fusions containing Cancer genes. Genes known to be involved in gene fusions are flagged based on information provided in FusionGDB and Cancer Genome Interpreter (CGI) databases. Fusion events are ordered by genomic coordinates of Gene A and then Gene B. DNA support (gene A/B) - DNA-supported fusion gene(s) (see Structural variants section); Reported fusion - fusion event reported in FusionGDB; Cancer gene(s) - gene fusion events involving Cancer genes

##### Save the genomic view table as html file (only when it was generated and saving tables is requested)
if ( nrow(fusion_annot_top) > 0 && params$save_tables ) {
  saveWidgetFix(widget=fusions.genomicView, file=paste(fusionsTableDir, "fusions.genomicView.html", sep = "/"), selfcontained=TRUE)  
}

##### Clean the space. The widget exists only when there were fusions to present, and rm() warns
##### about objects that do not exist
if ( exists("fusions.genomicView", inherits = FALSE) ) {
  rm(fusions.genomicView)
}

- Top hits

Expression profiles for gene fusion events involving DNA-supported fusion genes (see Structural variants section), gene fusions reported in FusionGDB or Cancer genes, indicated in green, red and grey columns in the Fusion genes table, respectively, and with the highest Split count and Pair count values.

NOTE: the visualisation is available only for fusion genes detected by Arriba (see the - Summary table).

ATAD2-FBXO32

Fusion genes expression
mRNA expression levels of fusion genes detected in patient’s sample and their average mRNA expression (Z-score) in samples from cancer cohorts.
Plot legend

Distribution of percentile values (y-axis) as a function of expression levels (Z-scores, x-axis) of ATAD2 in patient’s sample (black dot) and other reference cancer cohort(s) (median value(s)).

Read counts

Bar-plot illustrating read counts for ATAD2 across all samples. The ATAD2 read count in patient’s sample is indicated by black bar.

Expression distribution patterns

Plot illustrating distribution of expression levels (Z-scores) of ATAD2 observed across all samples along with simulated normal and bimodal distributions. The ATAD2 expression level observed in patient’s sample is indicated by black dot in each distribution.


Plot legend

Distribution of percentile values (y-axis) as a function of expression levels (Z-scores, x-axis) of FBXO32 in patient’s sample (black dot) and other reference cancer cohort(s) (median value(s)).

Read counts

Bar-plot illustrating read counts for FBXO32 across all samples. The FBXO32 read count in patient’s sample is indicated by black bar.

Expression distribution patterns

Plot illustrating distribution of expression levels (Z-scores) of FBXO32 observed across all samples along with simulated normal and bimodal distributions. The FBXO32 expression level observed in patient’s sample is indicated by black dot in each distribution.


Summary table
Percentiles

Table legend

The RED colour range indicate relatively high expression (percentile) values and BLUE colour range indicate relatively low expression (percentile) values in individual sample group. The Diff (Patient vs TEST (TCGA) ) column illustrates the difference between percentiles in patient sample and reference cancer cohort for each fusion gene. Genes considered to be oncogenes or tumour suppressor genes, according to OncoKB database, are also indicated. Genes are ordered by decreasing absolute values in the Diff (Patient vs TEST (TCGA) ) column. TSG - tumour suppressor gene


Z-scores

Table legend

The RED colour range indicate relatively high expression (Z-score) values and BLUE colour range indicate relatively low expression (Z-score) values in individual sample group. The Diff (Patient vs TEST (TCGA)) column illustrates the difference between Z-scores in patient sample and reference cancer cohort for each fusion gene. Genes considered to be oncogenes or tumour suppressor genes, according to OncoKB database, are also indicated. Genes are ordered by decreasing absolute values in the Diff (Patient vs TEST (TCGA)) column. TSG - tumour suppressor gene


NRIP1-AF127577.4;LINC02246

Fusion genes expression
mRNA expression levels of fusion genes detected in patient’s sample and their average mRNA expression (Z-score) in samples from cancer cohorts.
Plot legend

Distribution of percentile values (y-axis) as a function of expression levels (Z-scores, x-axis) of NRIP1 in patient’s sample (black dot) and other reference cancer cohort(s) (median value(s)).

Read counts

Bar-plot illustrating read counts for NRIP1 across all samples. The NRIP1 read count in patient’s sample is indicated by black bar.

Expression distribution patterns

Plot illustrating distribution of expression levels (Z-scores) of NRIP1 observed across all samples along with simulated normal and bimodal distributions. The NRIP1 expression level observed in patient’s sample is indicated by black dot in each distribution.


NOTE, expression data is not available for AF127577.4;LINC02246.

Summary table
Percentiles

Table legend

The RED colour range indicate relatively high expression (percentile) values and BLUE colour range indicate relatively low expression (percentile) values in individual sample group. The Diff (Patient vs TEST (TCGA) ) column illustrates the difference between percentiles in patient sample and reference cancer cohort for each fusion gene. Genes considered to be oncogenes or tumour suppressor genes, according to OncoKB database, are also indicated. Genes are ordered by decreasing absolute values in the Diff (Patient vs TEST (TCGA) ) column. TSG - tumour suppressor gene


Z-scores

Table legend

The RED colour range indicate relatively high expression (Z-score) values and BLUE colour range indicate relatively low expression (Z-score) values in individual sample group. The Diff (Patient vs TEST (TCGA)) column illustrates the difference between Z-scores in patient sample and reference cancer cohort for each fusion gene. Genes considered to be oncogenes or tumour suppressor genes, according to OncoKB database, are also indicated. Genes are ordered by decreasing absolute values in the Diff (Patient vs TEST (TCGA)) column. TSG - tumour suppressor gene



Structural variants

mRNA expression levels of genes located within detected structural variants (SVs), obtained from Manta SV caller, in patient’s sample and their average mRNA expression in samples from cancer cohorts.

SVs information for this sample is NOT AVAILABLE

- Summary table

Out of the genes affected by SVs, the expression of 0 was reliably measured in patient’s sample. The remaining genes are either not expressed or their expression level is too low to be detected (indicated in BLANK cells with missing values).

Percentiles

##### Update MySQL command to populate RNA-seq data portal
mysql_populate <- paste0(mysql_populate, ",Structural variants")
mysql_populate_update <- paste0(mysql_populate_update, ",Structural variants")

##### Inputs for the expression summary table of genes located within structural variants
targets <- ref_dataset.list[[dataset]][["sample_annot"]]
data <- ref_dataset.list[[dataset]][["data_to_report"]]

##### Consider only genes listed in the MANTA output that are present in the gene annotation
genes <- unique(manta_sv$Gene)
genes <- genes[ genes %in% ref_dataset.list[[dataset]][["gene_annot_all"]]$SYMBOL ]

##### Deal with no genes or with more genes of interest than the "top_genes" parameter allows
if ( length(genes) == 0 ) {
  genes <- NULL
  limit_genes <- FALSE
  genes_no <- 0
} else if ( length(genes) > params$top_genes ) {
  limit_genes <- TRUE
  genes_no <- params$top_genes
} else {
  limit_genes <- FALSE
  genes_no <- length(genes)
}

##### Build the percentile-based expression summary.
##### Only the first element of the exprTable output (the table widget) is used in this chunk
sv_genes.expr.perc <- exprTable(
  genes = genes,
  data = data,
  sv_data = manta_sv,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline") ],
  ext_links = TRUE,
  type = "perc",
  scaling = scaling
)

##### Show the table in the report
sv_genes.expr.perc[[1]]

##### Optionally export the table as a self-contained html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = sv_genes.expr.perc[[1]],
    file = paste(exprTableDir, "sv_genes.expr.perc.html", sep = "/"),
    selfcontained = TRUE
  )
}
Table legend

The RED colour range indicate relatively high expression (percentile) values and BLUE colour range indicate relatively low expression (percentile) values in individual sample group. The Diff (Patient vs TEST (TCGA)) column illustrates the difference between percentiles in patient sample and reference cancer cohort for each gene. Genes considered to be oncogenes or tumour suppressor genes, according to OncoKB database, are also indicated. Genes are ordered by increasing SV score and then by decreasing absolute values in the Diff (Patient vs TEST (TCGA)) columns. TSG - tumour suppressor gene

Tier: SV priority score based on AstraZeneca simple_sv_annotation.py script; 1 = high and 4 = low priority


Z-scores

##### Build the Z-score-based expression summary for genes located within structural variants
##### and keep only the first element of the exprTable output (the table widget)
sv_genes.expr.z <- exprTable(
  genes = genes,
  data = data,
  sv_data = manta_sv,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
  cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline") ],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  ext_links = TRUE,
  type = "z",
  scaling = scaling
)[[1]]

##### Show the table in the report
sv_genes.expr.z

##### Optionally export the table as a self-contained html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = sv_genes.expr.z,
    file = paste(exprTableDir, "sv_genes.expr.z.html", sep = "/"),
    selfcontained = TRUE
  )
}

##### The widget is not needed beyond this point
rm(list = "sv_genes.expr.z")
Table legend

The RED colour range indicates relatively high expression (Z-score) values and the BLUE colour range indicates relatively low expression (Z-score) values in each individual sample group. The Diff (Patient vs TEST (TCGA)) column shows the difference between the Z-scores in the patient sample and the reference cancer cohort for each gene. Genes considered to be oncogenes or tumour suppressor genes, according to the OncoKB database, are also indicated. Genes are ordered by increasing SV score and then by decreasing absolute values in the Diff (Patient vs TEST (TCGA)) column. TSG - tumour suppressor gene

Tier: SV priority score based on AstraZeneca simple_sv_annotation.py script; 1 = high and 4 = low priority


- Expression profiles


CN altered genes

Section overlaying the mRNA expression data with per-gene somatic copy-number (CN) data (from PURPLE), as well as SNVs/indels and SVs data, if available.

CN information for this sample is NOT AVAILABLE.

- Genomic view

Genes with available CN data (y-axis) are presented in the genomic context (x-axis). 0 of them (indicated by various colours) are Cancer genes and are gained or lost. All other genes are marked in gray or black.

##### Update MySQL command to populate RNA-seq data portal
mysql_populate <- paste0(mysql_populate, ",CN altered genes")
mysql_populate_update <- paste0(mysql_populate_update, ",CN altered genes")

##### Generate genomic view plot with per-gene CN values (y-axis) along chromosomal coordinates (x-axis)
suppressMessages(library(manhattanly))
suppressMessages(library(plotly))

##### "data" is used below as the full set of genes to plot and "data.sub" as the subset of genes to highlight
data <- ref_dataset.list[[dataset]][["expr_mut_cn_data_all"]]
data.sub <- ref_dataset.list[[dataset]][["expr_mut_cn_data"]]

##### Add SNVs
##### Convert the alteration labels to character so that the tags below can be appended with paste0
if ( runPcgrChunk ) {
  data$Alterations <- as.character(data$Alterations)
  data.sub$Alterations <- as.character(data.sub$Alterations)
}

##### Add fusion genes
if ( runFusionChunk ) {

  ##### Collect every gene taking part in a fusion, whichever partner (geneA or geneB) it is
  ##### The union makes sure that a gene reported as geneA in one fusion and as geneB in another gets the "; Fusion" tag only once
  cn_view.fusion_genes <- union(as.character(fusions$geneA), as.character(fusions$geneB))

  ##### Append the "; Fusion" tag to the alteration label of fusion genes
  data$Alterations[ data$Gene %in% cn_view.fusion_genes ] <- paste0( data$Alterations[ data$Gene %in% cn_view.fusion_genes ], "; Fusion")
  data.sub$Alterations[ data.sub$Gene %in% cn_view.fusion_genes ] <- paste0( data.sub$Alterations[ data.sub$Gene %in% cn_view.fusion_genes ], "; Fusion")

  ##### Remove the temporary gene list
  rm(cn_view.fusion_genes)
}

##### Add genes involved in SVs (if data available)
if ( runSVsChunk ) {

  ##### Append the "; SV" tag to the alteration label of genes listed in the SV data
  data$Alterations[ data$Gene %in% unique(manta_sv$Gene)  ] <- paste0( data$Alterations[ data$Gene %in% unique(manta_sv$Gene)  ], "; SV")
  data.sub$Alterations[ data.sub$Gene %in% unique(manta_sv$Gene)  ] <- paste0( data.sub$Alterations[ data.sub$Gene %in% unique(manta_sv$Gene)  ], "; SV")
}

##### Replace every occurrence of the "None" alteration label with "CN" (e.g. "None; Fusion" becomes "CN; Fusion")
data$Alterations <- gsub( "None", "CN", data$Alterations)
data.sub$Alterations <- gsub( "None", "CN", data.sub$Alterations)

##### Build the input data frame for manhattanly
##### Only genes whose symbol is present in the gene annotation are retained
data <- data[ data[["Gene"]] %in% ref_dataset.list[[dataset]][["gene_annot"]][["SYMBOL"]], ]

##### manhattanly is pointed at the "P" column for the y-axis, so the CN column is renamed accordingly
names(data)[match("CN", names(data))] <- "P"

##### Attach the gene annotation (incl. genomic coordinates) to the expression/CN data; genes without annotation are dropped
data.annot <- merge(
  data,
  ref_dataset.list[[dataset]][["gene_annot"]],
  by.x = "Gene",
  by.y = "SYMBOL",
  all.x = FALSE
)

##### Chromosome names and start positions are converted to numbers
##### NOTE: any SEQNAME that cannot be converted to a number becomes NA and the corresponding gene is removed in the next step
data.annot[["SEQNAME"]] <- as.numeric(data.annot[["SEQNAME"]])
data.annot[["GENESEQSTART"]] <- as.numeric(data.annot[["GENESEQSTART"]])
data.annot <- data.annot[ !is.na(data.annot[["SEQNAME"]]), ]

if ( nrow(data.annot) == 0 ) {

  ##### Nothing left to plot
  cat("None of the genes of interest are affected by changes in CN.")
  p <- NULL

} else {

  ##### Compute the manhattan plot data first; its x-axis positions ("pos") are needed to place the markers and labels for the genes of interest
  manhattanr.res <- manhattanr(
    x = data.annot,
    chr = "SEQNAME",
    bp = "GENESEQSTART",
    p = "P",
    snp = "Gene",
    gene = "Z_score_diff",
    annotation1 = "Perc_diff",
    annotation2 = "Alterations",
    logp = FALSE
  )

  ##### Keep the plot coordinates only for the genes of interest
  manhattanr.res$data <- manhattanr.res$data[ manhattanr.res$data$Gene %in% data.sub$Gene, ]

  ##### Draw all genes, then overlay coloured markers and text labels for the genes of interest
  ##### The cn_top and cn_bottom thresholds are drawn as gray horizontal lines
  p <- manhattanly(
    x = data.annot,
    chr = "SEQNAME",
    bp = "GENESEQSTART",
    p = "P",
    snp = "Gene",
    gene = "Z_score_diff",
    annotation1 = "Perc_diff",
    annotation2 = "Alterations",
    suggestiveline = cn_top,
    genomewideline = cn_bottom,
    suggestiveline_color = "gray",
    genomewideline_color = "gray",
    ylab = "CN value",
    showgrid = FALSE,
    title = "",
    logp = FALSE
  ) %>%
    add_markers(
      y = manhattanr.res$data$P,
      x = manhattanr.res$data$pos,
      name = manhattanr.res$data$Gene,
      text = paste0(
        "Gene: ", manhattanr.res$data$Gene,
        "\nZ_score_diff: ", manhattanr.res$data$Z_score_diff,
        "\nPerc_diff: ", manhattanr.res$data$Perc_diff,
        "\nAlterations: ", manhattanr.res$data$Alterations,
        "\nchr: ", manhattanr.res$data$CHR
      ),
      mode = "markers",
      marker = list(size = 10, symbol = "circle"),
      color = manhattanr.res$data$Gene,
      showlegend = TRUE,
      legendtitle = TRUE,
      inherit = FALSE
    ) %>%
    add_annotations(
      data = manhattanr.res$data,
      text = ~Gene,
      x = ~pos,
      xanchor = "left",
      y = ~P,
      yanchor = "top",
      font = list(color = "Grey", size = 10),
      legendtitle = TRUE,
      showarrow = FALSE
    )

  ##### Make sure the output directory for the plot exists
  PlotDir <- paste0(results_dir, "/", "cn_genomic_view")
  if ( !file.exists(PlotDir) ) {
    dir.create(PlotDir, recursive = TRUE)
  }

  ##### Write the interactive plot to an html file
  saveWidgetFix(p, file = paste0(PlotDir, "/", "cn_genomic_view.html"))

  ##### Close the current graphics device (if any) to free up some memory
  if ( !is.null(dev.list()) ) {
    invisible(dev.off())
  }
}

##### Show the plot in the report (NULL if there was nothing to plot)
p

##### Detach the plotting packages. Otherwise they clash with other graphics devices
detach("package:manhattanly", unload = FALSE)
detach("package:plotly", unload = FALSE)

CN data distribution

CN information for this sample is NOT AVAILABLE.

##### Show the CN data distribution histogram ("cn_dist_plot" is created elsewhere in the report)
suppressMessages(library(plotly))
cn_dist_plot

##### Make sure the output directory for the plot exists
PlotDir <- paste0(results_dir, "/", "cn_dist_plot")
if ( !file.exists(PlotDir) ) {
  dir.create(PlotDir, recursive = TRUE)
}

##### Write the interactive plot to an html file, then detach plotly
saveWidgetFix(cn_dist_plot, file = paste0(PlotDir, "/", "cn_dist_plot.html"))
detach("package:plotly", unload = FALSE)

##### Close the current graphics device (if any) to free up some memory
if ( !is.null(dev.list()) ) {
  invisible(dev.off())
}

- Expression vs CN

Scatterplot comparing the per-gene difference in mRNA expression of Cancer genes between patient’s sample and cancer individuals (y-axis), and CN values (x-axis, from PURPLE).

Percentiles

##### Scatterplot of per-gene expression (percentiles) against CN values
##### When PCGR results are available (runPcgrChunk) the plot is generated with alt_data = TRUE
suppressMessages(library(plotly))
cn_genes <- data.sub$Gene

if ( length(cn_genes) > 0 ) {

  if ( runPcgrChunk ) {
    mutCNexprPlot(
      data = data.sub,
      alt_data = TRUE,
      cn_bottom = cn_bottom,
      cn_top = cn_top,
      comp_cancer = comp_cancer_group,
      type = "perc",
      report_dir = results_dir
    )

  } else {
    data.sub <- data.sub[ data.sub$Gene %in% cn_genes, ]
    mutCNexprPlot(
      data = data.sub,
      alt_data = FALSE,
      cn_bottom = cn_bottom,
      cn_top = cn_top,
      comp_cancer = comp_cancer_group,
      type = "perc",
      report_dir = results_dir
    )
  }

} else {

  ##### No genes to plot; cn_genes is set to NULL so that the following chunks can test for it
  cn_genes <- NULL
  cat("None of the genes of interest are affected by changes in CN.")
}

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload = FALSE)

##### Close the current graphics device (if any) to free up some memory
if ( !is.null(dev.list()) ) {
  invisible(dev.off())
}

Z-scores

##### Scatterplot of per-gene expression (Z-scores) against CN values
##### "cn_genes" and "data.sub" are taken over from the previous chunks
suppressMessages(library(plotly))

if ( length(cn_genes) > 0 ) {

  if ( runPcgrChunk ) {
    data.sub <- data.sub[ data.sub$Gene %in% cn_genes, ]
    mutCNexprPlot(
      data = data.sub,
      alt_data = TRUE,
      cn_bottom = cn_bottom,
      cn_top = cn_top,
      comp_cancer = comp_cancer_group,
      type = "z",
      report_dir = results_dir
    )

  } else {
    data.sub <- data.sub[ data.sub$Gene %in% cn_genes, ]
    mutCNexprPlot(
      data = data.sub,
      alt_data = FALSE,
      cn_bottom = cn_bottom,
      cn_top = cn_top,
      comp_cancer = comp_cancer_group,
      type = "z",
      report_dir = results_dir
    )
  }

} else {

  ##### No genes to plot
  cn_genes <- NULL
  cat("None of the genes of interest are affected by changes in CN.")
}

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload = FALSE)

##### Close the current graphics device (if any) to free up some memory
if ( !is.null(dev.list()) ) {
  invisible(dev.off())
}

- Summary table

Out of the genes within gained or lost regions 0 are Cancer genes. The expression of 0 of these genes was reliably measured in patient’s sample. The remaining 0 genes are either not expressed or their expression level is too low to be detected (indicated in BLANK cells with missing values).

Gains

Table summarising the mRNA expression values in cancer and patient samples for genes with CN values >= (NA) (gains), based on patient’s genomic data (from PURPLE), and mutation status if available (from PCGR).

Percentiles
##### Expression summary table (percentiles) for genes within CN gains, with CN values and, if available, mutation info
##### Keep only the genes with CN value >= cn_top and retain just the CN column
cn_data <- ref_dataset.list[[dataset]][["expr_mut_cn_data"]]
cn_data <- cn_data[ cn_data$CN >= cn_top, ]
cn_data <- cn_data[, "CN", drop = FALSE]
genes_gains <- as.character(cn_genes[ cn_genes %in% rownames(cn_data) ])

##### Number of gained genes to report, capped at params$top_genes (0 if there are none)
genes_gains_no <- if ( length(genes_gains) == 0 ) {
  0
} else if ( length(genes_gains) > params$top_genes ) {
  params$top_genes
} else {
  length(genes_gains)
}

##### An empty gene list is passed on as NULL
if ( length(genes_gains) == 0 ) {
  genes_gains <- NULL
}

##### Get expression data
data <- ref_dataset.list[[dataset]][["data_to_report"]]

##### NOTE: the table is only created when runPurpleChunk is TRUE; the code below it assumes that it exists
if ( runPurpleChunk ) {

  if ( runPcgrChunk ) {

    ##### Expression, CN and mutation (PCGR) data
    cn_expr_genes.expr.gains.perc <- exprTable(
      genes = genes_gains,
      data = data,
      cn_data = cn_data,
      cn_decrease = TRUE,
      targets = targets,
      sampleName = sample_name,
      ext_cancer = ext_cancer_group,
      int_cancer = int_cancer_group,
      comp_cancer = comp_cancer_group,
      add_cancer = add_cancer_group,
      genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
      mut_annot = ref_genes.list[["pcgr"]][, c("SYMBOL", "TIER", "CONSEQUENCE", "VARIANT_CLASS", "AF_TUMOR", "GENOMIC_CHANGE", "PROTEIN_CHANGE")],
      oncokb_annot = ref_genes.list[["genes_oncokb"]],
      cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline")],
      ext_links = TRUE,
      type = "perc",
      scaling = scaling
    )

  } else {

    ##### Expression and CN data only
    cn_expr_genes.expr.gains.perc <- exprTable(
      genes = genes_gains,
      data = data,
      cn_data = cn_data,
      cn_decrease = TRUE,
      targets = targets,
      sampleName = sample_name,
      ext_cancer = ext_cancer_group,
      int_cancer = int_cancer_group,
      comp_cancer = comp_cancer_group,
      add_cancer = add_cancer_group,
      genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
      oncokb_annot = ref_genes.list[["genes_oncokb"]],
      cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline")],
      ext_links = TRUE,
      type = "perc",
      scaling = scaling
    )
  }
}

##### Show the table (first element of the exprTable output) in the report
cn_expr_genes.expr.gains.perc[[1]]

##### Optionally write the table to a stand-alone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = cn_expr_genes.expr.gains.perc[[1]],
    file = paste0(exprTableDir, "/", "cn_expr_genes.expr.gains.perc.html"),
    selfcontained = TRUE
  )
}
Table legend

The RED colour range indicates relatively high expression (percentile) values and the BLUE colour range indicates relatively low expression (percentile) values in each individual sample group. The BLANK cells with missing values indicate genes with no/low expression. The Diff (Patient vs TEST (TCGA)) column shows the difference between the percentiles in the patient sample and the reference cancer cohort for each gene. The CN values based on the patient’s genomic data are presented in the Patient (CN) column, with a horizontal blue bar indicating the CN value of each gene in the context of other genes. If mutation data is available, then the variants’ tier, consequence, class and tumour allele frequency (AF), as well as the genomic and protein change, are also provided on the right-hand side based on information from the PCGR report (similar to the Mutated genes section). Genes are ordered by Patient (CN) and then by decreasing absolute values in the Diff (Patient vs TEST (TCGA)) column. CN - copy-number


Z-scores
##### Expression summary table (Z-scores) for genes within CN gains, with CN values and, if available, mutation info
##### "genes_gains", "cn_data" and "data" are taken over from the percentiles chunk above
##### NOTE: the table is only created when runPurpleChunk is TRUE; the code below it assumes that it exists
if ( runPcgrChunk && runPurpleChunk ) {

  ##### Expression, CN and mutation (PCGR) data
  cn_expr_genes.expr.gains.z <- exprTable(
    genes = genes_gains,
    data = data,
    cn_data = cn_data,
    cn_decrease = TRUE,
    targets = targets,
    sampleName = sample_name,
    ext_cancer = ext_cancer_group,
    int_cancer = int_cancer_group,
    comp_cancer = comp_cancer_group,
    add_cancer = add_cancer_group,
    genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
    mut_annot = ref_genes.list[["pcgr"]][, c("SYMBOL", "TIER", "CONSEQUENCE", "VARIANT_CLASS", "AF_TUMOR", "GENOMIC_CHANGE", "PROTEIN_CHANGE")],
    oncokb_annot = ref_genes.list[["genes_oncokb"]],
    cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline")],
    ext_links = TRUE,
    type = "z",
    scaling = scaling
  )

} else if ( runPurpleChunk ) {

  ##### Expression and CN data only
  ##### The OncoKB annotation is passed here as well, so that this table is annotated in the same way as the
  ##### corresponding percentiles table and the losses tables
  cn_expr_genes.expr.gains.z <- exprTable(
    genes = genes_gains,
    data = data,
    cn_data = cn_data,
    cn_decrease = TRUE,
    targets = targets,
    sampleName = sample_name,
    ext_cancer = ext_cancer_group,
    int_cancer = int_cancer_group,
    comp_cancer = comp_cancer_group,
    add_cancer = add_cancer_group,
    genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
    oncokb_annot = ref_genes.list[["genes_oncokb"]],
    cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline")],
    ext_links = TRUE,
    type = "z",
    scaling = scaling
  )
}

##### Present the expression, CN and mutation data summary table (first element of the exprTable output)
cn_expr_genes.expr.gains.z[[1]]

##### Save the expression table as html file
if ( params$save_tables ) {
  saveWidgetFix(widget = cn_expr_genes.expr.gains.z[[1]], file = paste(exprTableDir, "cn_expr_genes.expr.gains.z.html", sep = "/"), selfcontained = TRUE)
}

##### Clean the space (the gains-specific CN data is re-created for losses in the next chunk)
rm(cn_expr_genes.expr.gains.z, cn_data)
Table legend

The RED colour range indicates relatively high expression (Z-score) values and the BLUE colour range indicates relatively low expression (Z-score) values in each individual sample group. The BLANK cells with missing values indicate genes with no/low expression. The Diff (Patient vs TEST (TCGA)) column shows the difference between the Z-scores in the patient sample and the reference cancer cohort for each gene. The CN values based on the patient’s genomic data are presented in the Patient (CN) column, with a horizontal blue bar indicating the CN value of each gene in the context of other genes. If mutation data is available, then the variants’ tier, consequence, class and tumour allele frequency (AF), as well as the genomic and protein change, are also provided on the right-hand side based on information from the PCGR report (similar to the Mutated genes section). Genes are ordered by Patient (CN) and then by decreasing absolute values in the Diff (Patient vs TEST (TCGA)) column. CN - copy-number


Losses

Table summarising the mRNA expression values in cancer and patient samples for genes with CN values =< (NA) (losses), based on patient’s genomic data (from PURPLE), and mutation status if available (from PCGR).

Percentiles
##### Expression summary table (percentiles) for genes within CN losses, with CN values and, if available, mutation info
##### Keep only the genes with CN value <= cn_bottom and retain just the CN column
cn_data <- ref_dataset.list[[dataset]][["expr_mut_cn_data"]]
cn_data <- cn_data[ cn_data$CN <= cn_bottom, ]
cn_data <- cn_data[, "CN", drop = FALSE]
genes_losses <- as.character(cn_genes[ cn_genes %in% rownames(cn_data) ])

##### Number of lost genes to report, capped at params$top_genes (0 if there are none)
genes_losses_no <- if ( length(genes_losses) == 0 ) {
  0
} else if ( length(genes_losses) > params$top_genes ) {
  params$top_genes
} else {
  length(genes_losses)
}

##### An empty gene list is passed on as NULL
if ( length(genes_losses) == 0 ) {
  genes_losses <- NULL
}

##### Flag whether gains and losses together exceed params$top_genes
limit_genes <- if ( genes_gains_no + genes_losses_no > params$top_genes ) TRUE else FALSE

##### Get expression data
data <- ref_dataset.list[[dataset]][["data_to_report"]]

##### NOTE: the table is only created when runPurpleChunk is TRUE; the code below it assumes that it exists
if ( runPurpleChunk ) {

  if ( runPcgrChunk ) {

    ##### Expression, CN and mutation (PCGR) data
    cn_expr_genes.expr.losses.perc <- exprTable(
      genes = genes_losses,
      data = data,
      cn_data = cn_data,
      cn_decrease = FALSE,
      targets = targets,
      sampleName = sample_name,
      ext_cancer = ext_cancer_group,
      int_cancer = int_cancer_group,
      comp_cancer = comp_cancer_group,
      add_cancer = add_cancer_group,
      genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
      mut_annot = ref_genes.list[["pcgr"]][, c("SYMBOL", "TIER", "CONSEQUENCE", "VARIANT_CLASS", "AF_TUMOR", "GENOMIC_CHANGE", "PROTEIN_CHANGE")],
      oncokb_annot = ref_genes.list[["genes_oncokb"]],
      cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline")],
      ext_links = TRUE,
      type = "perc",
      scaling = scaling
    )

  } else {

    ##### Expression and CN data only
    cn_expr_genes.expr.losses.perc <- exprTable(
      genes = genes_losses,
      data = data,
      cn_data = cn_data,
      cn_decrease = FALSE,
      targets = targets,
      sampleName = sample_name,
      ext_cancer = ext_cancer_group,
      int_cancer = int_cancer_group,
      comp_cancer = comp_cancer_group,
      add_cancer = add_cancer_group,
      genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
      oncokb_annot = ref_genes.list[["genes_oncokb"]],
      cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline")],
      ext_links = TRUE,
      type = "perc",
      scaling = scaling
    )
  }
}

##### Show the table (first element of the exprTable output) in the report
cn_expr_genes.expr.losses.perc[[1]]

##### Optionally write the table to a stand-alone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = cn_expr_genes.expr.losses.perc[[1]],
    file = paste0(exprTableDir, "/", "cn_expr_genes.expr.losses.perc.html"),
    selfcontained = TRUE
  )
}
Table legend

The RED colour range indicates relatively high expression (percentile) values and the BLUE colour range indicates relatively low expression (percentile) values in each individual sample group. The BLANK cells with missing values indicate genes with no/low expression. The Diff (Patient vs TEST (TCGA)) column shows the difference between the percentiles in the patient sample and the reference cancer cohort for each gene. The CN values based on the patient’s genomic data are presented in the Patient (CN) column, with a horizontal blue bar indicating the CN value of each gene in the context of other genes. If mutation data is available, then the variants’ tier, consequence, class and tumour allele frequency (AF), as well as the genomic and protein change, are also provided on the right-hand side based on information from the PCGR report (similar to the Mutated genes section). Genes are ordered by Patient (CN) and then by decreasing absolute values in the Diff (Patient vs TEST (TCGA)) column. CN - copy-number


Z-scores
##### Expression summary table (Z-scores) for genes within CN losses, with CN values and, if available, mutation info
##### "genes_losses", "cn_data" and "data" are taken over from the percentiles chunk above
##### NOTE: the table is only created when runPurpleChunk is TRUE; the code below it assumes that it exists
if ( runPurpleChunk ) {

  if ( runPcgrChunk ) {

    ##### Expression, CN and mutation (PCGR) data
    cn_expr_genes.expr.losses.z <- exprTable(
      genes = genes_losses,
      data = data,
      cn_data = cn_data,
      cn_decrease = FALSE,
      targets = targets,
      sampleName = sample_name,
      ext_cancer = ext_cancer_group,
      int_cancer = int_cancer_group,
      comp_cancer = comp_cancer_group,
      add_cancer = add_cancer_group,
      genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
      mut_annot = ref_genes.list[["pcgr"]][, c("SYMBOL", "TIER", "CONSEQUENCE", "VARIANT_CLASS", "AF_TUMOR", "GENOMIC_CHANGE", "PROTEIN_CHANGE")],
      oncokb_annot = ref_genes.list[["genes_oncokb"]],
      cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline")],
      ext_links = TRUE,
      type = "z",
      scaling = scaling
    )

  } else {

    ##### Expression and CN data only
    cn_expr_genes.expr.losses.z <- exprTable(
      genes = genes_losses,
      data = data,
      cn_data = cn_data,
      cn_decrease = FALSE,
      targets = targets,
      sampleName = sample_name,
      ext_cancer = ext_cancer_group,
      int_cancer = int_cancer_group,
      comp_cancer = comp_cancer_group,
      add_cancer = add_cancer_group,
      genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
      oncokb_annot = ref_genes.list[["genes_oncokb"]],
      cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline")],
      ext_links = TRUE,
      type = "z",
      scaling = scaling
    )
  }
}

##### Show the table (first element of the exprTable output) in the report
cn_expr_genes.expr.losses.z[[1]]

##### Optionally write the table to a stand-alone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = cn_expr_genes.expr.losses.z[[1]],
    file = paste0(exprTableDir, "/", "cn_expr_genes.expr.losses.z.html"),
    selfcontained = TRUE
  )
}

##### The CN data and the table object are not needed any more
rm(cn_data, cn_expr_genes.expr.losses.z)
Table legend

The RED colour range indicates relatively high expression (Z-score) values and the BLUE colour range indicates relatively low expression (Z-score) values in each individual sample group. The BLANK cells with missing values indicate genes with no/low expression. The Diff (Patient vs TEST (TCGA)) column shows the difference between the Z-scores in the patient sample and the reference cancer cohort for each gene. The CN values based on the patient’s genomic data are presented in the Patient (CN) column, with a horizontal blue bar indicating the CN value of each gene in the context of other genes. If mutation data is available, then the variants’ tier, consequence, class and tumour allele frequency (AF), as well as the genomic and protein change, are also provided on the right-hand side based on information from the PCGR report (similar to the Mutated genes section). Genes are ordered by Patient (CN) and then by decreasing absolute values in the Diff (Patient vs TEST (TCGA)) column. CN - copy-number


- Expression profiles

Gains


Losses


Immune markers

Section presenting expression levels of immune markers to assess pre-existing anti-cancer immunity and likelihood of response to immunotherapy. Their mRNA expression levels are presented in patient’s sample along their average mRNA expression in samples from cancer cohorts.

Out of the 70 immune markers the expression of 70 was reliably measured in patient’s sample. The remaining 0 genes are either not expressed or their expression level is too low to be detected (indicated in BLANK cells with missing values).

- Summary table

Percentiles

##### Register the "Immune markers" section in the MySQL strings used to populate the RNA-seq data portal
mysql_populate <- paste0(mysql_populate, ",Immune markers")
mysql_populate_update <- paste0(mysql_populate_update, ",Immune markers")

##### Collect the sample annotation, the expression data and the unique immune marker gene symbols
targets <- ref_dataset.list[[dataset]][["sample_annot"]]
data <- ref_dataset.list[[dataset]][["data_to_report"]]
genes <- unique(unlist(ref_genes.list[["genes_immune"]]$immune_markers$SYMBOL))

##### An empty gene list is passed on as NULL
if ( length(genes) == 0 ) {
  genes <- NULL
}

##### Build the expression summary table (percentiles) for the immune markers
##### Only the first element of the list returned by exprTable is kept
immune_genes.expr.perc <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL", "Immune_Cycle_Role")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  ext_links = TRUE,
  type = "perc",
  scaling = scaling
)[[1]]

##### Show the table in the report
immune_genes.expr.perc

##### Optionally write the table to a stand-alone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = immune_genes.expr.perc,
    file = paste0(exprTableDir, "/", "immune_genes.expr.perc.html"),
    selfcontained = TRUE
  )
}

##### The table object is not needed any more
rm(immune_genes.expr.perc)
Table legend

The RED colour range indicate relatively high expression (percentile) values and BLUE colour range indicate relatively low expression (percentile) values in individual sample group. The BLANK cells with missing values indicate genes with no/low expression. The Diff (Patient vs TEST (TCGA)) column illustrates the difference between percentiles in patient sample and reference cancer cohort for each immune marker. Genes are ordered by decreasing absolute values in the Diff (Patient vs TEST (TCGA)) column.


Z-scores

##### Build the expression summary table (Z-scores) for the immune markers
##### "genes", "data" and "targets" are taken over from the percentiles chunk above
##### Only the first element of the list returned by exprTable is kept
immune_genes.expr.z <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL", "Immune_Cycle_Role")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  ext_links = TRUE,
  type = "z",
  scaling = scaling
)[[1]]

##### Show the table in the report
immune_genes.expr.z

##### Optionally write the table to a stand-alone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = immune_genes.expr.z,
    file = paste0(exprTableDir, "/", "immune_genes.expr.z.html"),
    selfcontained = TRUE
  )
}

##### The table object is not needed any more
rm(immune_genes.expr.z)
Table legend

The RED colour range indicate relatively high expression (Z-score) values and BLUE colour range indicate relatively low expression (Z-score) values in individual sample group. The BLANK cells with missing values indicate genes with no/low expression. The Diff (Patient vs TEST (TCGA)) column illustrates the difference between Z-scores in patient sample and reference cancer cohort for each immune marker. Genes are ordered by decreasing absolute values in the Diff (Patient vs TEST (TCGA)) column.


- Expression overview

Overview of immune markers expression profiles in patient’s sample and in samples from cancer patients.

Percentiles

suppressMessages(library(plotly))

##### Overview boxplot of immune markers expression (percentiles), genes sorted alphabetically
##### "genes" is NULL when no immune markers are available
if ( is.null(genes) ) {
  cat("\nNo expression data is available for immune markers!\n")
} else {
  glanceExprPlot(
    genes = genes,
    data = data,
    targets = targets,
    sampleName = sample_name,
    ext_cancer = ext_cancer_group,
    int_cancer = int_cancer_group,
    comp_cancer = comp_cancer_group,
    add_cancer = add_cancer_group,
    hexcode = "immune_genes",
    type = "perc",
    sort = "alphabetically",
    scaling = scaling,
    report_dir = results_dir
  )
}

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload = FALSE)

##### Close the current graphics device (if any) to free up some memory
if ( !is.null(dev.list()) ) {
  invisible(dev.off())
}
Plot legend

The individual box(es) represent the TEST (TCGA) reference cancer cohort(s), and the BLACK dots indicate expression (percentile) values for each gene in the patient sample. Genes are ordered alphabetically.


Z-scores

suppressMessages(library(plotly))

##### Overview boxplot of immune markers expression (Z-scores), genes sorted alphabetically
##### "genes" is NULL when no immune markers are available
if ( is.null(genes) ) {
  cat("\nNo expression data is available for immune markers!\n")
} else {
  glanceExprPlot(
    genes = genes,
    data = data,
    targets = targets,
    sampleName = sample_name,
    ext_cancer = ext_cancer_group,
    int_cancer = int_cancer_group,
    comp_cancer = comp_cancer_group,
    add_cancer = add_cancer_group,
    hexcode = "immune_genes",
    type = "z",
    sort = "alphabetically",
    scaling = scaling,
    report_dir = results_dir
  )
}

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload = FALSE)

##### Close the current graphics device (if any) to free up some memory
if ( !is.null(dev.list()) ) {
  invisible(dev.off())
}
Plot legend

The individual box(es) represent the TEST (TCGA) reference cancer cohort(s), and the BLACK dots indicate expression (Z-score) values for each gene in the patient sample. Genes are ordered alphabetically.


##### Spider web plot presenting the cancer immunity status of the patient. Background on the immunogram:
# https://www.sciencedirect.com/science/article/pii/S1556086417300084
# https://www.sciencedirect.com/science/article/pii/S1556086417302125
# https://www.europeanurology.com/article/S0302-2838(18)30685-7/fulltext?rss=yes
##### NOTE: currently, the mean expression (Z-score) values of genes from each of the 7 CIC steps are presented rather than the normalized enrichment scores (NES) from GSEA analysis performed for each geneset (CIC step)

##### Draw the web plot for the row of the immunogram data frame with index ncol(data)
##### NOTE(review): this presumably assumes that the patient sample is the last column of "data" and the matching row of immunogram.df - to be confirmed
webplot(
  as.data.frame(ref_genes.list[["genes_immune"]]$immunogram.df),
  data.row = ncol(data),
  main = "",
  add = FALSE,
  col = "black"
)

##### Disabled: overlay of samples with specific immunogram patterns, e.g. T-cell–rich, T-cell–poor, T-cell–intermediate...
#webplot(as.data.frame(ref_genes.list[["genes_immune"]]$immunogram.df), data.row = 5, main = "", add = TRUE, col = "powderblue", lty = 5)
#webplot(as.data.frame(ref_genes.list[["genes_immune"]]$immunogram.df), data.row = 156, main = "", add = TRUE, col = "forestgreen", lty = 5)
#webplot(as.data.frame(ref_genes.list[["genes_immune"]]$immunogram.df), data.row = 194, main = "", add = TRUE, col = "red", lty = 5)
#legend("topright", legend=c("Patient", "T-cell–rich","T-cell–poor", "T-cell–intermediate"), fill=c("black", "powderblue", "forestgreen", "red"), bty="n", bg = "transparent", cex = 0.8)

##### Disabled: clear plots to free up some memory
#if(!is.null(dev.list())) invisible(dev.off())
##### Collect the inputs for the immunogram expression tables: samples annotation, data to report and the unique immunogram gene symbols
targets <- ref_dataset.list[[dataset]][["sample_annot"]]
data <- ref_dataset.list[[dataset]][["data_to_report"]]
genes <- unique(unlist(ref_genes.list[["genes_immune"]]$immunogram$SYMBOL))

##### Deal with no genes (an empty gene set is passed on as NULL)
if ( length(genes) < 1 ) {
  genes <- NULL
}

##### Generate expression summary table (percentiles) for the immunogram genes. Only the first element of the exprTable output is kept
immunogram.expr.perc <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL", "CIC")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  ext_links = TRUE,
  type = "perc",
  scaling = scaling
)[[1]]

##### Show the table in the report
immunogram.expr.perc

##### Write the table into an html file if requested
if ( params$save_tables ) {
  saveWidgetFix(
    widget = immunogram.expr.perc,
    file = paste(exprTableDir, "immunogram.expr.perc.html", sep = "/"),
    selfcontained = TRUE
  )
}

##### The table object is not needed any more
rm(immunogram.expr.perc)
##### Generate expression summary table (Z-scores) for the immunogram genes. Only the first element of the exprTable output is kept
immunogram.expr.z <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL", "CIC")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  ext_links = TRUE,
  type = "z",
  scaling = scaling
)[[1]]

##### Show the table in the report
immunogram.expr.z

##### Write the table into an html file if requested
if ( params$save_tables ) {
  saveWidgetFix(
    widget = immunogram.expr.z,
    file = paste(exprTableDir, "immunogram.expr.z.html", sep = "/"),
    selfcontained = TRUE
  )
}

HRD genes

This section presents expression levels of homologous recombination deficiency (HRD) genes to assess how many of them demonstrate low expression, which may indicate potential promoter methylation events. Their mRNA expression levels in the patient’s sample are presented alongside their average mRNA expression in samples from cancer cohorts.

Out of the 36 HRD genes, the expression of 35 was reliably measured in the patient’s sample. The remaining 1 gene is either not expressed or its expression level is too low to be detected (indicated by BLANK cells with missing values).

- Summary table

Percentiles

##### Add the "HRD genes" section to the MySQL command used to populate RNA-seq data portal
mysql_populate <- paste0(mysql_populate, ",HRD genes")
mysql_populate_update <- paste0(mysql_populate_update, ",HRD genes")

##### Collect the inputs for the HRD genes expression tables: samples annotation, data to report and the unique HRD gene symbols
targets <- ref_dataset.list[[dataset]][["sample_annot"]]
data <- ref_dataset.list[[dataset]][["data_to_report"]]
genes <- unique(unlist(ref_genes.list[["genes_hrd"]]$SYMBOL))

##### Deal with no genes (an empty gene set is passed on as NULL)
if ( length(genes) < 1 ) {
  genes <- NULL
}

##### Generate expression summary table (percentiles) for the HRD genes. The complete exprTable output is kept, since its second element is used later on
hrd_genes.expr.perc <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  ext_links = TRUE,
  type = "perc",
  scaling = scaling
)

##### Show the table (first element of the output) in the report
hrd_genes.expr.perc[[1]]
##### Write the table into an html file if requested
if ( params$save_tables ) {
  saveWidgetFix(
    widget = hrd_genes.expr.perc[[1]],
    file = paste(exprTableDir, "hrd_genes.expr.perc.html", sep = "/"),
    selfcontained = TRUE
  )
}
Table legend

The RED colour range indicates relatively high expression (percentile) values and the BLUE colour range indicates relatively low expression (percentile) values in each individual sample group. The BLANK cells with missing values indicate genes with no/low expression. The Diff (Patient vs TEST (TCGA)) column illustrates the difference between percentiles in the patient sample and the reference cancer cohort for each HRD gene. Genes are ordered by decreasing absolute values in the Diff (Patient vs TEST (TCGA)) column. TSG - tumour suppressor gene


Z-scores

##### Generate expression summary table (Z-scores) for the HRD genes
hrd_genes.expr.z <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  ext_links = TRUE,
  type = "z",
  scaling = scaling
)

##### Show the table (first element of the output) in the report
hrd_genes.expr.z[[1]]
##### Write the table into an html file if requested
if ( params$save_tables ) {
  saveWidgetFix(
    widget = hrd_genes.expr.z[[1]],
    file = paste(exprTableDir, "hrd_genes.expr.z.html", sep = "/"),
    selfcontained = TRUE
  )
}

##### The Z-score table object is not needed any more
rm(hrd_genes.expr.z)
Table legend

The RED colour range indicates relatively high expression (Z-score) values and the BLUE colour range indicates relatively low expression (Z-score) values in each individual sample group. The BLANK cells with missing values indicate genes with no/low expression. The Diff (Patient vs TEST (TCGA)) column illustrates the difference between Z-scores in the patient sample and the reference cancer cohort for each HRD gene. Genes are ordered by decreasing absolute values in the Diff (Patient vs TEST (TCGA)) column.


- Expression overview

Overview of HRD genes expression profiles in patient’s sample and in samples from cancer patients.

Percentiles

##### plotly is attached only for the duration of this chunk
suppressMessages(library(plotly))

##### Generate overview boxplot (percentiles, genes sorted alphabetically), or report that there are no HRD genes to plot
if ( is.null(genes) ) {
  cat("\nNo expression data is available for HRD genes!\n")
} else {
  glanceExprPlot(
    genes = genes,
    data = data,
    targets = targets,
    sampleName = sample_name,
    ext_cancer = ext_cancer_group,
    int_cancer = int_cancer_group,
    comp_cancer = comp_cancer_group,
    add_cancer = add_cancer_group,
    hexcode = "hrd_genes",
    type = "perc",
    sort = "alphabetically",
    scaling = scaling,
    report_dir = results_dir
  )
}
##### Take plotly off the search path again. Otherwise it clashes with other graphics devices
detach("package:plotly", unload = FALSE)

##### Close the currently active graphics device (if there is one) to free up some memory
if ( !is.null(dev.list()) ) {
  invisible(dev.off())
}
Plot legend

The individual box(es) represent the TEST (TCGA) reference cancer cohort(s), and the BLACK dots indicate expression (percentile) values for each gene in the patient sample. Genes are ordered alphabetically.


Z-scores

##### plotly is attached only for the duration of this chunk
suppressMessages(library(plotly))

##### Generate overview boxplot (Z-scores, genes sorted alphabetically), or report that there are no HRD genes to plot
if ( is.null(genes) ) {
  cat("\nNo expression data is available for HRD genes!\n")
} else {
  glanceExprPlot(
    genes = genes,
    data = data,
    targets = targets,
    sampleName = sample_name,
    ext_cancer = ext_cancer_group,
    int_cancer = int_cancer_group,
    comp_cancer = comp_cancer_group,
    add_cancer = add_cancer_group,
    hexcode = "hrd_genes",
    type = "z",
    sort = "alphabetically",
    scaling = scaling,
    report_dir = results_dir
  )
}
##### Take plotly off the search path again. Otherwise it clashes with other graphics devices
detach("package:plotly", unload = FALSE)

##### Close the currently active graphics device (if there is one) to free up some memory
if ( !is.null(dev.list()) ) {
  invisible(dev.off())
}
Plot legend

The individual box(es) represent the TEST (TCGA) reference cancer cohort(s), and the BLACK dots indicate expression (Z-score) values for each gene in the patient sample. Genes are ordered alphabetically.


Cancer genes

mRNA expression levels of cancer genes in the patient’s sample and their average mRNA expression in samples from cancer cohorts. These include genes reported in the following gene panels/resources: UMCCR cancer genes, OncoKB, MSK-IMPACT, MSK-HEME, Foundation One, Foundation One Heme, Vogelstein and Sanger Cancer Gene Census (CGC).

- Summary table

Out of the 1315 cancer genes the expression of 1280 was reliably measured in patient’s sample. The remaining 35 genes are either not expressed or their expression level is too low to be detected (indicated in BLANK cells with missing values).

Percentiles

##### Add the "Cancer genes" and "All genes" sections to the MySQL command used to populate RNA-seq data portal
mysql_populate <- paste0(mysql_populate, ",Cancer genes,All genes")
mysql_populate_update <- paste0(mysql_populate_update, ",Cancer genes,All genes")

##### Collect the inputs for the cancer genes expression tables. The gene list is taken from the row names of the cancer genes reference table (OncoKB and UMCCR, https://github.com/vladsaveliev/NGS_Utils/blob/master/ngs_utils/reference_data/key_genes/umccr_cancer_genes.2019-03-20.tsv)
targets <- ref_dataset.list[[dataset]][["sample_annot"]]
data <- ref_dataset.list[[dataset]][["data_to_report"]]
genes <- rownames(ref_genes.list[["genes_cancer"]])

##### Deal with no genes (an empty gene set is passed on as NULL)
if ( length(genes) < 1 ) {
  genes <- NULL
}

##### Generate expression summary table (percentiles) for the cancer genes. The complete exprTable output is kept, since its second element is used later on
cancer_genes.expr.perc <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  cancer_genes = ref_genes.list[["genes_cancer"]],
  ext_links = TRUE,
  type = "perc",
  scaling = scaling
)

##### Show the table (first element of the output) in the report
cancer_genes.expr.perc[[1]]
##### Write the table into an html file if requested
if ( params$save_tables ) {
  saveWidgetFix(
    widget = cancer_genes.expr.perc[[1]],
    file = paste(exprTableDir, "cancer_genes.expr.perc.html", sep = "/"),
    selfcontained = TRUE
  )
}
Table legend

The RED colour range indicates relatively high expression (percentile) values and the BLUE colour range indicates relatively low expression (percentile) values in each individual sample group. The BLANK cells with missing values indicate genes with no/low expression. The Diff (Patient vs TEST (TCGA)) column illustrates the difference between percentiles in the patient sample and the reference cancer cohort for each cancer gene. Genes considered to be oncogenes or tumour suppressor genes according to the OncoKB database, as well as their inclusion in various sequencing panels, are also indicated. Genes are ordered by decreasing absolute values in the Diff (Patient vs TEST (TCGA)) column. TSG - tumour suppressor gene


Z-scores

##### Generate expression summary table (Z-scores) for the cancer genes (OncoKB and UMCCR, https://github.com/vladsaveliev/NGS_Utils/blob/master/ngs_utils/reference_data/key_genes/umccr_cancer_genes.2019-03-20.tsv). Only the first element of the exprTable output is kept
cancer_genes.expr.z <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  cancer_genes = ref_genes.list[["genes_cancer"]],
  ext_links = TRUE,
  type = "z",
  scaling = scaling
)[[1]]

##### Show the table in the report
cancer_genes.expr.z
##### Write the table into an html file if requested
if ( params$save_tables ) {
  saveWidgetFix(
    widget = cancer_genes.expr.z,
    file = paste(exprTableDir, "cancer_genes.expr.z.html", sep = "/"),
    selfcontained = TRUE
  )
}

##### The Z-score table object is not needed any more
rm(cancer_genes.expr.z)
Table legend

The RED colour range indicates relatively high expression (Z-score) values and the BLUE colour range indicates relatively low expression (Z-score) values in each individual sample group. The BLANK cells with missing values indicate genes with no/low expression. The Diff (Patient vs TEST (TCGA)) column illustrates the difference between Z-scores in the patient sample and the reference cancer cohort for each cancer gene. Genes considered to be oncogenes or tumour suppressor genes according to the OncoKB database, as well as their inclusion in various sequencing panels, are also indicated. Genes are ordered by decreasing absolute values in the Diff (Patient vs TEST (TCGA)) column. TSG - tumour suppressor gene


- Expression overview

Overview of expression profiles of 50 altered cancer genes with the greatest difference in mRNA expression (percentile) values between patient’s sample and the average mRNA expression in samples from cancer patients.

Percentiles

##### plotly is attached only for the duration of this chunk
suppressMessages(library(plotly))

##### Generate overview boxplot for the first (up to) 50 genes listed in the second element of the cancer genes percentile table
##### head() is used instead of [1:50], because indexing past the end of a shorter gene list pads it with NA entries
genes <- head(cancer_genes.expr.perc[[2]]$SYMBOL, 50)

##### Deal with no genes (an empty gene set is passed on as NULL, so that the is.null checks on "genes" fall through to the "no data" message)
if ( length(genes) == 0 ) {
  genes <- NULL
}

if ( !is.null(genes) ) {
  glanceExprPlot(genes = genes, data = data, targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group, hexcode = "cancer_genes", type = "perc", sort = "none", scaling = scaling, report_dir = results_dir)
} else {
  cat("\nNo expression data is available for cancer genes!\n")
}
##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload = FALSE)

##### Clear plots to free up some memory
if ( !is.null(dev.list()) ) {
  invisible(dev.off())
}
Plot legend

The individual box(es) represent the TEST (TCGA) reference cancer cohort(s), and the BLACK dots indicate expression (percentile) values for each gene in the patient sample. Genes are ordered by decreasing absolute values in the Patient vs TEST (TCGA) comparison.


Z-scores

##### plotly is attached only for the duration of this chunk
suppressMessages(library(plotly))

##### Generate overview boxplot (Z-scores) for the gene list selected in the percentiles chunk (kept in the given order), or report that there are no cancer genes to plot
if ( is.null(genes) ) {
  cat("\nNo expression data is available for cancer genes!\n")
} else {
  glanceExprPlot(
    genes = genes,
    data = data,
    targets = targets,
    sampleName = sample_name,
    ext_cancer = ext_cancer_group,
    int_cancer = int_cancer_group,
    comp_cancer = comp_cancer_group,
    add_cancer = add_cancer_group,
    hexcode = "cancer_genes",
    type = "z",
    sort = "none",
    scaling = scaling,
    report_dir = results_dir
  )
}
##### Take plotly off the search path again. Otherwise it clashes with other graphics devices
detach("package:plotly", unload = FALSE)

##### Close the currently active graphics device (if there is one) to free up some memory
if ( !is.null(dev.list()) ) {
  invisible(dev.off())
}
Plot legend

The individual box(es) represent the TEST (TCGA) reference cancer cohort(s), and the BLACK dots indicate expression (Z-score) values for each gene in the patient sample. Genes are ordered by decreasing absolute values in the Patient vs TEST (TCGA) comparison.


##### Create directory for the drugs tables (inside the results directory)
drugsTableDir <- paste(results_dir, "drugsTables", sep = "/")

##### dir.exists() is used rather than file.exists(), since the latter is also TRUE when a regular file of that name exists, in which case the directory would silently not be created
if ( !dir.exists(drugsTableDir) ) {
  dir.create(drugsTableDir, recursive = TRUE)
}
##### Drugs table for mutated genes: the gene symbols are taken from the second element of the mutated genes percentile table
genes <- mut_genes.expr.perc[[2]]$SYMBOL

##### Match the genes against the CIViC variant summaries and clinical evidence ("Predictive" evidence, "mutation" variants)
drugsTable.mut_genes <- civicDrugTable(
  genes,
  civic_var_summaries = caner_genes_annot.list[["civic_var_summaries"]],
  civic_clin_evid = caner_genes_annot.list[["civic_clin_evid"]],
  evid_type = "Predictive",
  var_type = "mutation"
)

##### The table is shown in the report only when the "drugs" parameter is set
if ( params$drugs ) {
  drugsTable.mut_genes[[1]]
}

##### Write the drugs table into an html file if requested
if ( params$save_tables ) {
  saveWidgetFix(
    widget = drugsTable.mut_genes[[1]],
    file = paste(drugsTableDir, "drugsTable.mut_genes.html", sep = "/"),
    selfcontained = TRUE
  )
}
##### Generate table with drugs targeting fusion genes
##### Select the fusions flagged as reported, or with DNA support for either of the partner genes. The row selection is computed once and with %in% rather than ==, so that missing annotation values are treated as "no" instead of putting NA into the row index (which would add all-NA rows, and so NA gene symbols, to the gene list)
fusions_keep <- fusion_annot$reported_fusion %in% "Yes" | fusion_annot$geneA_dna_support %in% "Yes" | fusion_annot$geneB_dna_support %in% "Yes"
genesA <- as.vector(fusions[ fusions_keep, ]$geneA)
genesB <- as.vector(fusions[ fusions_keep, ]$geneB)

##### Match the unique partner genes against the CIViC variant summaries and clinical evidence ("Predictive" evidence, "fusion" variants)
drugsTable.fusion_genes <- civicDrugTable(genes = unique(c(genesA, genesB)), civic_var_summaries = caner_genes_annot.list[["civic_var_summaries"]], civic_clin_evid = caner_genes_annot.list[["civic_clin_evid"]], evid_type = "Predictive", var_type = "fusion")

##### The table is shown in the report only when the "drugs" parameter is set
if ( params$drugs ) {
  drugsTable.fusion_genes[[1]]
}

##### Save the drugs table as html file
if ( params$save_tables ) {
  saveWidgetFix(widget = drugsTable.fusion_genes[[1]], file = paste(drugsTableDir, "drugsTable.fusion_genes.html", sep = "/"), selfcontained = TRUE)
}
##### Drugs table for genes affected by structural variants: the unique entries of the "Gene" column of the manta_sv table
genes <- unique(manta_sv$Gene)

##### Match the genes against the CIViC variant summaries and clinical evidence ("Predictive" evidence, no variant type specified)
drugsTable.sv_genes <- civicDrugTable(
  genes,
  civic_var_summaries = caner_genes_annot.list[["civic_var_summaries"]],
  civic_clin_evid = caner_genes_annot.list[["civic_clin_evid"]],
  evid_type = "Predictive",
  var_type = NULL
)

##### The table is shown in the report only when the "drugs" parameter is set
if ( params$drugs ) {
  drugsTable.sv_genes[[1]]
}

##### Write the drugs table into an html file if requested
if ( params$save_tables ) {
  saveWidgetFix(
    widget = drugsTable.sv_genes[[1]],
    file = paste(drugsTableDir, "drugsTable.sv_genes.html", sep = "/"),
    selfcontained = TRUE
  )
}
##### Drugs table for CN altered genes (gains): the gene symbols are taken from the second element of the CN gains percentile table
genes <- cn_expr_genes.expr.gains.perc[[2]]$SYMBOL

##### Match the genes against the CIViC variant summaries and clinical evidence ("Predictive" evidence, "copy_gain" variants)
drugsTable.CN_altered_genes_gains <- civicDrugTable(
  genes,
  civic_var_summaries = caner_genes_annot.list[["civic_var_summaries"]],
  civic_clin_evid = caner_genes_annot.list[["civic_clin_evid"]],
  evid_type = "Predictive",
  var_type = "copy_gain"
)

##### The table is shown in the report only when the "drugs" parameter is set
if ( params$drugs ) {
  drugsTable.CN_altered_genes_gains[[1]]
}

##### Write the drugs table into an html file if requested
if ( params$save_tables ) {
  saveWidgetFix(
    widget = drugsTable.CN_altered_genes_gains[[1]],
    file = paste(drugsTableDir, "drugsTable.CN_altered_genes_gains.html", sep = "/"),
    selfcontained = TRUE
  )
}
##### Drugs table for CN altered genes (losses): the gene symbols are taken from the second element of the CN losses percentile table
genes <- cn_expr_genes.expr.losses.perc[[2]]$SYMBOL

##### Match the genes against the CIViC variant summaries and clinical evidence ("Predictive" evidence, "copy_loss" variants)
drugsTable.CN_altered_genes_losses <- civicDrugTable(
  genes,
  civic_var_summaries = caner_genes_annot.list[["civic_var_summaries"]],
  civic_clin_evid = caner_genes_annot.list[["civic_clin_evid"]],
  evid_type = "Predictive",
  var_type = "copy_loss"
)

##### The table is shown in the report only when the "drugs" parameter is set
if ( params$drugs ) {
  drugsTable.CN_altered_genes_losses[[1]]
}

##### Write the drugs table into an html file if requested
if ( params$save_tables ) {
  saveWidgetFix(
    widget = drugsTable.CN_altered_genes_losses[[1]],
    file = paste(drugsTableDir, "drugsTable.CN_altered_genes_losses.html", sep = "/"),
    selfcontained = TRUE
  )
}
##### Drugs table for HRD genes: keep only those symbols from the second element of the HRD percentile table that are among the row names of the data to report
genes <- hrd_genes.expr.perc[[2]]$SYMBOL[ hrd_genes.expr.perc[[2]]$SYMBOL %in% rownames(ref_dataset.list[[dataset]][["data_to_report"]]) ]

##### Match the genes against the CIViC variant summaries and clinical evidence ("Predictive" evidence, "mutation" variants)
drugsTable.hrd_genes <- civicDrugTable(
  genes,
  civic_var_summaries = caner_genes_annot.list[["civic_var_summaries"]],
  civic_clin_evid = caner_genes_annot.list[["civic_clin_evid"]],
  evid_type = "Predictive",
  var_type = "mutation"
)

##### The table is shown in the report only when the "drugs" parameter is set
if ( params$drugs ) {
  drugsTable.hrd_genes[[1]]
}

##### Write the drugs table into an html file if requested
if ( params$save_tables ) {
  saveWidgetFix(
    widget = drugsTable.hrd_genes[[1]],
    file = paste(drugsTableDir, "drugsTable.hrd_genes.html", sep = "/"),
    selfcontained = TRUE
  )
}
##### Update MySQL command to populate RNA-seq data portal
mysql_populate <- paste0(mysql_populate, ",Drug matching")
mysql_populate_update <- paste0(mysql_populate_update, ",Drug matching")

##### Generate table with drugs targeting the first (up to) 50 genes listed in the second element of the cancer genes percentile table
##### head() is used instead of [1:50], because indexing past the end of a shorter gene list pads it with NA entries that would be passed on to civicDrugTable
genes <- head(cancer_genes.expr.perc[[2]]$SYMBOL, 50)

##### Match the genes against the CIViC variant summaries and clinical evidence ("Predictive" evidence, "expression" variants)
drugsTable.cancer_genes <- civicDrugTable(genes, civic_var_summaries = caner_genes_annot.list[["civic_var_summaries"]], civic_clin_evid = caner_genes_annot.list[["civic_clin_evid"]], evid_type = "Predictive", var_type = "expression")

##### The table is shown in the report only when the "drugs" parameter is set
if ( params$drugs ) {
  drugsTable.cancer_genes[[1]]
}

##### Save the drugs table as html file
if ( params$save_tables ) {
  saveWidgetFix(widget = drugsTable.cancer_genes[[1]], file = paste(drugsTableDir, "drugsTable.cancer_genes.html", sep = "/"), selfcontained = TRUE)
}
##### Finalise the MySQL command used to populate RNA-seq data portal and write it into a file
##### Add the "Input data" section
mysql_populate <- paste0(mysql_populate, ",Input data")
mysql_populate_update <- paste0(mysql_populate_update, ",Input data")

##### Add the "Clinical information" section only if runClinicalChunk is set
if ( runClinicalChunk ) {
  mysql_populate <- paste0(mysql_populate, ",Clinical information")
  mysql_populate_update <- paste0(mysql_populate_update, ",Clinical information")
}

##### Complete both command parts with the report summary text and the current date
mysql_populate <- paste0(
  mysql_populate,
  "\", \"Transcriptome summary for sample ", sample_name,
  " generated on ", Sys.Date(),
  "\"", ", \"", Sys.Date(), "\" )"
)
mysql_populate_update <- paste0(
  mysql_populate_update,
  "\", Summary=\"Transcriptome summary for sample ", sample_name,
  " generated on ", Sys.Date(),
  "\"", ", Date=\"", Sys.Date(), "\";"
)

##### Join the two parts, append the statements re-numbering the ID column and save everything as <sample_name>.RNAseq_report.sql in the results directory
mysql_populate <- paste0(
  mysql_populate,
  "\n  ", mysql_populate_update,
  "\nSET @ID := 0;\nUPDATE RNAseq_reports SET ID = ( SELECT @ID := @ID + 1 );"
)
writeLines(
  mysql_populate,
  con = paste0(results_dir, "/", sample_name, ".RNAseq_report.sql")
)

Addendum

Parameters

##### Print the name and the value(s) of every report parameter. Multiple values of one parameter are collapsed into a comma-separated string
##### seq_along() is used rather than 1:length(), so that nothing is printed for an empty parameter list (1:0 would iterate over the indices 1 and 0)
for ( i in seq_along(params) ) {
  cat(paste0("Parameter: ", names(params)[i], "\nValue: ", paste(unlist(params[i]), collapse = ","), "\n\n"))
}
Parameter: annot_file
Value: genes/tx2gene.ensembl.v95.csv

Parameter: genes_cancer
Value: genes/umccr_cancer_genes.2019-03-20.tsv

Parameter: genes_immunogram
Value: genes/Genes_immunogram.txt

Parameter: genes_immune_markers
Value: genes/Genes_immune_markers.txt

Parameter: genes_hrd
Value: genes/Genes_HRD.txt

Parameter: oncokb_genes
Value: OncoKB/CancerGenesList.txt

Parameter: oncokb_clin_vars
Value: OncoKB/allActionableVariants.txt

Parameter: oncokb_all_vars
Value: OncoKB/allAnnotatedVariants.txt

Parameter: civic_var_summaries
Value: CIViC/01-Oct-2018-VariantSummaries.tsv

Parameter: civic_clin_evid
Value: CIViC/01-Oct-2018-ClinicalEvidenceSummaries.tsv

Parameter: cancer_biomarkers_trans
Value: cancer_biomarkers_database/cancer_genes_upon_trans.tsv

Parameter: FusionGDB
Value: FusionGDB/TCGA_ChiTaRS_combined_fusion_ORF_analyzed_gencode_h19v19_fgID.txt

Parameter: sample_name
Value: test_sample_WTS

Parameter: dataset
Value: TEST

Parameter: bcbio_rnaseq
Value: 

Parameter: dragen_rnaseq
Value: /Users/kanwals/UMCCR/git/RNAsum/data/test_data/stratus/test_sample_WTS_dragen_v3.9.3

Parameter: report_dir
Value: /Users/kanwals/UMCCR/git/RNAsum/rmd_files/../data/test_data/stratus/test_sample_WTS/RNAsum

Parameter: ref_data_dir
Value: ../data

Parameter: transform
Value: CPM

Parameter: norm
Value: TMM

Parameter: batch_rm
Value: TRUE

Parameter: filter
Value: TRUE

Parameter: log
Value: TRUE

Parameter: scaling
Value: gene-wise

Parameter: drugs
Value: FALSE

Parameter: immunogram
Value: FALSE

Parameter: umccrise
Value: 

Parameter: clinical_info
Value: NA

Parameter: clinical_id
Value: NA

Parameter: subject_id
Value: NA

Parameter: sample_source
Value: -

Parameter: dataset_name_incl
Value: 

Parameter: project
Value: -

Parameter: save_tables
Value: TRUE

Parameter: pcgr_tier
Value: 4

Parameter: pcgr_splice_vars
Value: TRUE

Parameter: cn_loss
Value: 5

Parameter: cn_gain
Value: 95

Parameter: top_genes
Value: 5

Parameter: hide_code_btn
Value: TRUE

Parameter: grch_version
Value: 38

Parameter: ensembl_version
Value: 86

Parameter: ucsc_genome_assembly
Value: 38
Reporter details

##### State who generated the report, on which node and under which operating system (as returned by Sys.info())
cat(
  paste0(
    "The report was generated by \"", Sys.info()[["user"]],
    "\" using \"", Sys.info()[["nodename"]],
    "\" node and \"", Sys.info()[["sysname"]],
    "\" operating system."
  )
)
The report was generated by "kanwals" using "9999L-204093-M" node and "Darwin" operating system.
Session information

##### Print the R session details (R version, platform settings and the versions of the packages in use) to document the environment the report was generated in
devtools::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value                       
 version  R version 3.6.3 (2020-02-29)
 os       macOS  10.16                
 system   x86_64, darwin13.4.0        
 ui       unknown                     
 language (EN)                        
 collate  en_AU.UTF-8                 
 ctype    en_AU.UTF-8                 
 tz       Australia/Melbourne         
 date     2022-01-21                  

─ Packages ───────────────────────────────────────────────────────────────────
 package                     * version  date       lib source        
 annotate                      1.64.0   2019-10-29 [1] Bioconductor  
 AnnotationDbi               * 1.48.0   2019-10-29 [1] Bioconductor  
 AnnotationFilter            * 1.10.0   2019-10-29 [1] Bioconductor  
 aroma.light                   3.16.0   2019-10-29 [1] Bioconductor  
 askpass                       1.1      2019-01-13 [1] CRAN (R 3.6.3)
 assertthat                    0.2.1    2019-03-21 [1] CRAN (R 3.6.3)
 audio                         0.1-6    2019-03-19 [1] CRAN (R 3.6.1)
 backports                     1.2.1    2020-12-09 [1] CRAN (R 3.6.3)
 beepr                         1.3      2018-06-04 [1] CRAN (R 3.6.1)
 Biobase                     * 2.46.0   2019-10-29 [1] Bioconductor  
 BiocFileCache                 1.10.0   2019-10-29 [1] Bioconductor  
 BiocGenerics                * 0.32.0   2019-10-29 [1] Bioconductor  
 BiocParallel                * 1.20.0   2019-10-30 [1] Bioconductor  
 biomaRt                       2.42.0   2019-10-29 [1] Bioconductor  
 Biostrings                  * 2.54.0   2019-10-29 [1] Bioconductor  
 bit                           4.0.4    2020-08-04 [1] CRAN (R 3.6.3)
 bit64                         4.0.5    2020-08-30 [1] CRAN (R 3.6.3)
 bitops                        1.0-7    2021-04-24 [1] CRAN (R 3.6.3)
 blob                          1.2.1    2020-01-20 [1] CRAN (R 3.6.3)
 broom                         0.7.6    2021-04-05 [1] CRAN (R 3.6.3)
 BSgenome                    * 1.54.0   2019-10-29 [1] Bioconductor  
 BSgenome.Hsapiens.UCSC.hg38 * 1.4.1    2021-12-10 [1] Bioconductor  
 cachem                        1.0.5    2021-05-15 [1] CRAN (R 3.6.3)
 callr                         3.7.0    2021-04-20 [1] CRAN (R 3.6.3)
 cellranger                    1.1.0    2016-07-27 [1] CRAN (R 3.6.3)
 cli                           2.5.0    2021-04-26 [1] CRAN (R 3.6.3)
 colorspace                    2.0-1    2021-05-04 [1] CRAN (R 3.6.3)
 config                        0.3.1    2020-12-17 [1] CRAN (R 3.6.3)
 crayon                        1.4.1    2021-02-08 [1] CRAN (R 3.6.3)
 crosstalk                     1.1.1    2021-01-12 [1] CRAN (R 3.6.3)
 curl                          4.3.1    2021-04-30 [1] CRAN (R 3.6.3)
 data.table                    1.14.0   2021-02-21 [1] CRAN (R 3.6.3)
 DBI                           1.1.1    2021-01-15 [1] CRAN (R 3.6.3)
 dbplyr                        2.1.1    2021-04-06 [1] CRAN (R 3.6.3)
 DelayedArray                * 0.12.0   2019-10-29 [1] Bioconductor  
 desc                          1.3.0    2021-03-05 [1] CRAN (R 3.6.3)
 DESeq                         1.38.0   2019-10-29 [1] Bioconductor  
 devtools                    * 2.4.1    2021-05-05 [1] CRAN (R 3.6.3)
 digest                        0.6.27   2020-10-24 [1] CRAN (R 3.6.3)
 dplyr                       * 1.0.6    2021-05-05 [1] CRAN (R 3.6.3)
 DT                          * 0.18     2021-04-14 [1] CRAN (R 3.6.3)
 EDASeq                      * 2.20.0   2019-10-29 [1] Bioconductor  
 edgeR                       * 3.28.0   2019-10-29 [1] Bioconductor  
 ellipsis                      0.3.2    2021-04-29 [1] CRAN (R 3.6.3)
 EnsDb.Hsapiens.v86          * 2.99.0   2021-12-10 [1] Bioconductor  
 ensembldb                   * 2.10.0   2019-10-29 [1] Bioconductor  
 evaluate                      0.14     2019-05-28 [1] CRAN (R 3.6.3)
 fansi                         0.4.2    2021-01-15 [1] CRAN (R 3.6.3)
 farver                        2.1.0    2021-02-28 [1] CRAN (R 3.6.3)
 fastmap                       1.1.0    2021-01-25 [1] CRAN (R 3.6.3)
 forcats                     * 0.5.1    2021-01-27 [1] CRAN (R 3.6.3)
 forecast                      8.14     2021-03-11 [1] CRAN (R 3.6.3)
 fracdiff                      1.5-1    2020-01-24 [1] CRAN (R 3.6.3)
 fs                            1.5.0    2020-07-31 [1] CRAN (R 3.6.3)
 gargle                        1.1.0    2021-04-02 [1] CRAN (R 3.6.3)
 genefilter                    1.68.0   2019-10-29 [1] Bioconductor  
 geneplotter                   1.64.0   2019-10-29 [1] Bioconductor  
 generics                      0.1.0    2020-10-31 [1] CRAN (R 3.6.3)
 GenomeInfoDb                * 1.22.0   2019-10-29 [1] Bioconductor  
 GenomeInfoDbData              1.2.2    2021-12-10 [1] Bioconductor  
 GenomicAlignments           * 1.22.0   2019-10-29 [1] Bioconductor  
 GenomicFeatures             * 1.38.0   2019-10-29 [1] Bioconductor  
 GenomicRanges               * 1.38.0   2019-10-29 [1] Bioconductor  
 getopt                        1.20.3   2019-03-22 [1] CRAN (R 3.6.3)
 ggforce                     * 0.3.3    2021-03-05 [1] CRAN (R 3.6.3)
 ggplot2                     * 3.3.3    2020-12-30 [1] CRAN (R 3.6.3)
 glue                        * 1.4.2    2020-08-27 [1] CRAN (R 3.6.3)
 googleAuthR                   1.4.0    2021-04-02 [1] CRAN (R 3.6.3)
 googlesheets                  0.3.0    2019-10-11 [1] local         
 gridExtra                     2.3      2017-09-09 [1] CRAN (R 3.6.3)
 gtable                        0.3.0    2019-03-25 [1] CRAN (R 3.6.3)
 h2o                           3.26.0.2 2019-08-01 [1] CRAN (R 3.6.1)
 haven                         2.4.1    2021-04-23 [1] CRAN (R 3.6.3)
 highr                         0.9      2021-04-16 [1] CRAN (R 3.6.3)
 hms                           1.1.0    2021-05-17 [1] CRAN (R 3.6.3)
 htmltools                   * 0.5.1.1  2021-01-22 [1] CRAN (R 3.6.3)
 htmlwidgets                 * 1.5.3    2020-12-10 [1] CRAN (R 3.6.3)
 httr                          1.4.2    2020-07-20 [1] CRAN (R 3.6.3)
 hwriter                       1.3.2    2014-09-10 [1] CRAN (R 3.6.3)
 IRanges                     * 2.20.0   2019-10-29 [1] Bioconductor  
 jpeg                          0.1-8.1  2019-10-24 [1] CRAN (R 3.6.3)
 jsonlite                      1.7.2    2020-12-09 [1] CRAN (R 3.6.3)
 knitr                       * 1.33     2021-04-24 [1] CRAN (R 3.6.3)
 lares                       * 4.7      2019-10-11 [1] local         
 lattice                       0.20-44  2021-05-02 [1] CRAN (R 3.6.3)
 latticeExtra                  0.6-29   2019-12-19 [1] CRAN (R 3.6.3)
 lazyeval                      0.2.2    2019-03-15 [1] CRAN (R 3.6.3)
 lifecycle                     1.0.0    2021-02-15 [1] CRAN (R 3.6.3)
 limma                       * 3.42.0   2019-10-29 [1] Bioconductor  
 lmtest                        0.9-38   2020-09-09 [1] CRAN (R 3.6.3)
 locfit                        1.5-9.4  2020-03-25 [1] CRAN (R 3.6.3)
 lubridate                     1.7.10   2021-02-26 [1] CRAN (R 3.6.3)
 magrittr                      2.0.1    2020-11-17 [1] CRAN (R 3.6.3)
 MASS                          7.3-54   2021-05-03 [1] CRAN (R 3.6.3)
 Matrix                        1.3-3    2021-05-04 [1] CRAN (R 3.6.3)
 matrixStats                 * 0.58.0   2021-01-29 [1] CRAN (R 3.6.3)
 memoise                       2.0.0    2021-01-26 [1] CRAN (R 3.6.3)
 mice                          3.13.0   2021-01-27 [1] CRAN (R 3.6.3)
 modelr                        0.1.8    2020-05-19 [1] CRAN (R 3.6.3)
 munsell                       0.5.0    2018-06-12 [1] CRAN (R 3.6.3)
 nlme                          3.1-150  2020-10-24 [1] CRAN (R 3.6.3)
 NLP                           0.2-1    2020-10-14 [1] CRAN (R 3.6.3)
 nnet                          7.3-16   2021-05-03 [1] CRAN (R 3.6.3)
 openssl                       1.4.4    2021-04-30 [1] CRAN (R 3.6.3)
 openxlsx                    * 4.2.3    2020-10-27 [1] CRAN (R 3.6.3)
 optparse                    * 1.6.6    2020-04-16 [1] CRAN (R 3.6.3)
 pander                        0.6.3    2018-11-06 [1] CRAN (R 3.6.3)
 pdftools                    * 3.0.1    2021-05-06 [1] CRAN (R 3.6.3)
 pillar                        1.6.1    2021-05-16 [1] CRAN (R 3.6.3)
 pkgbuild                      1.2.0    2020-12-15 [1] CRAN (R 3.6.3)
 pkgconfig                     2.0.3    2019-09-22 [1] CRAN (R 3.6.3)
 pkgload                       1.2.1    2021-04-06 [1] CRAN (R 3.6.3)
 plotly                        4.9.0    2019-04-10 [1] CRAN (R 3.6.1)
 plyr                          1.8.6    2020-03-03 [1] CRAN (R 3.6.3)
 png                         * 0.1-7    2013-12-03 [1] CRAN (R 3.6.3)
 polyclip                      1.10-0   2019-03-14 [1] CRAN (R 3.6.3)
 preprocessCore              * 1.48.0   2019-10-29 [1] Bioconductor  
 prettyunits                   1.1.1    2020-01-24 [1] CRAN (R 3.6.3)
 pROC                          1.17.0.1 2021-01-13 [1] CRAN (R 3.6.3)
 processx                      3.5.2    2021-04-30 [1] CRAN (R 3.6.3)
 progress                      1.2.2    2019-05-16 [1] CRAN (R 3.6.3)
 ProtGenerics                  1.18.0   2019-10-29 [1] Bioconductor  
 ps                            1.6.0    2021-02-28 [1] CRAN (R 3.6.3)
 purrr                       * 0.3.4    2020-04-17 [1] CRAN (R 3.6.3)
 qpdf                          1.1      2019-03-07 [1] CRAN (R 3.6.3)
 quadprog                      1.5-8    2019-11-20 [1] CRAN (R 3.6.3)
 quantmod                      0.4.18   2020-12-09 [1] CRAN (R 3.6.3)
 R.methodsS3                   1.8.1    2020-08-26 [1] CRAN (R 3.6.3)
 R.oo                          1.24.0   2020-08-26 [1] CRAN (R 3.6.3)
 R.utils                       2.10.1   2020-08-26 [1] CRAN (R 3.6.3)
 R6                            2.5.0    2020-10-28 [1] CRAN (R 3.6.3)
 rappdirs                      0.3.3    2021-01-31 [1] CRAN (R 3.6.3)
 rapportools                 * 1.0      2014-01-07 [1] CRAN (R 3.6.3)
 RCircos                     * 1.2.1    2019-03-12 [1] CRAN (R 3.6.3)
 RColorBrewer                  1.1-2    2014-12-07 [1] CRAN (R 3.6.3)
 Rcpp                          1.0.6    2021-01-15 [1] CRAN (R 3.6.3)
 RCurl                         1.98-1.3 2021-03-16 [1] CRAN (R 3.6.3)
 rdrop2                        0.8.2.1  2020-08-05 [1] CRAN (R 3.6.3)
 readr                       * 1.4.0    2020-10-05 [1] CRAN (R 3.6.3)
 readxl                        1.3.1    2019-03-13 [1] CRAN (R 3.6.3)
 remotes                       2.3.0    2021-04-01 [1] CRAN (R 3.6.3)
 reprex                        2.0.0    2021-04-02 [1] CRAN (R 3.6.3)
 reshape                     * 0.8.8    2018-10-23 [1] CRAN (R 3.6.3)
 reshape2                      1.4.4    2020-04-09 [1] CRAN (R 3.6.3)
 rhdf5                       * 2.30.0   2019-10-29 [1] Bioconductor  
 Rhdf5lib                      1.8.0    2019-10-29 [1] Bioconductor  
 rlang                       * 0.4.11   2021-04-30 [1] CRAN (R 3.6.3)
 rlist                         0.4.6.1  2016-04-04 [1] CRAN (R 3.6.3)
 rmarkdown                     2.8      2021-05-07 [1] CRAN (R 3.6.3)
 rprojroot                     2.0.2    2020-11-15 [1] CRAN (R 3.6.3)
 Rsamtools                   * 2.2.0    2019-10-29 [1] Bioconductor  
 RSQLite                       2.2.5    2021-03-27 [1] CRAN (R 3.6.3)
 rstudioapi                    0.13     2020-11-12 [1] CRAN (R 3.6.3)
 rtracklayer                 * 1.46.0   2019-10-29 [1] Bioconductor  
 rvest                         1.0.0    2021-03-09 [1] CRAN (R 3.6.3)
 S4Vectors                   * 0.24.0   2019-10-29 [1] Bioconductor  
 scales                      * 1.1.1    2020-05-11 [1] CRAN (R 3.6.3)
 sessioninfo                   1.1.1    2018-11-05 [1] CRAN (R 3.6.3)
 ShortRead                   * 1.44.0   2019-10-29 [1] Bioconductor  
 slam                          0.1-48   2020-12-03 [1] CRAN (R 3.6.3)
 sp                            1.4-5    2021-01-10 [1] CRAN (R 3.6.3)
 stringi                       1.5.3    2020-09-09 [1] CRAN (R 3.6.3)
 stringr                     * 1.4.0    2019-02-10 [1] CRAN (R 3.6.3)
 SummarizedExperiment        * 1.16.0   2019-10-29 [1] Bioconductor  
 survival                      3.2-10   2021-03-16 [1] CRAN (R 3.6.3)
 testthat                      3.0.2    2021-02-14 [1] CRAN (R 3.6.3)
 tibble                      * 3.1.2    2021-05-16 [1] CRAN (R 3.6.3)
 tidyr                       * 1.1.3    2021-03-03 [1] CRAN (R 3.6.3)
 tidyselect                    1.1.1    2021-04-30 [1] CRAN (R 3.6.3)
 tidyverse                   * 1.3.1    2021-04-15 [1] CRAN (R 3.6.3)
 timeDate                      3043.102 2018-02-21 [1] CRAN (R 3.6.3)
 tm                            0.7-8    2020-11-18 [1] CRAN (R 3.6.3)
 tseries                       0.10-47  2019-06-05 [1] CRAN (R 3.6.3)
 TTR                           0.24.2   2020-09-01 [1] CRAN (R 3.6.3)
 tweenr                        1.0.2    2021-03-23 [1] CRAN (R 3.6.3)
 tximport                    * 1.14.0   2019-10-29 [1] Bioconductor  
 urca                          1.3-0    2016-09-06 [1] CRAN (R 3.6.3)
 usethis                     * 2.0.1    2021-02-10 [1] CRAN (R 3.6.3)
 utf8                          1.2.1    2021-03-12 [1] CRAN (R 3.6.3)
 vctrs                         0.3.8    2021-04-29 [1] CRAN (R 3.6.3)
 viridisLite                   0.4.0    2021-04-13 [1] CRAN (R 3.6.3)
 withr                         2.4.2    2021-04-18 [1] CRAN (R 3.6.3)
 wordcloud                     2.6      2018-08-24 [1] CRAN (R 3.6.3)
 xfun                          0.23     2021-05-15 [1] CRAN (R 3.6.3)
 XML                           3.99-0.3 2020-01-20 [1] CRAN (R 3.6.3)
 xml2                          1.3.2    2020-04-23 [1] CRAN (R 3.6.3)
 xtable                        1.8-4    2019-04-21 [1] CRAN (R 3.6.3)
 xts                           0.12-0   2020-01-19 [1] CRAN (R 3.6.3)
 XVector                     * 0.26.0   2019-10-29 [1] Bioconductor  
 yaml                          2.2.1    2020-02-01 [1] CRAN (R 3.6.3)
 zip                           2.1.1    2020-08-27 [1] CRAN (R 3.6.3)
 zlibbioc                      1.32.0   2019-10-29 [1] Bioconductor  
 zoo                           1.8-9    2021-03-09 [1] CRAN (R 3.6.3)

[1] /Users/kanwals/miniconda/envs/rnasum/lib/R/library
---
title: 'Patient Transcriptome Summary'
author: 'UMCCR'
date: '`r Sys.Date()`'
output:
  html_document:
    keep_md: yes
    code_download: true
    code_folding: hide
    theme: readable
    css: RNAseq_report.css
    toc: true
    toc_float: true
  rmdformats::material:
    highlight: kate
params:
  sample_name: 'test_sample_WTS'
  dataset: 'TEST'
  bcbio_rnaseq: '../data/test_data/final/test_sample_WTS'
  dragen_rnaseq: NULL
  report_dir: '../data/test_data/final/test_sample_WTS/RNAsum'
  ref_data_dir: '../data'
  transform: 'CPM'
  filter: TRUE
  norm: 'TMM'
  batch_rm: TRUE
  log: TRUE
  scaling: 'gene-wise'
  drugs: FALSE
  immunogram: FALSE
  umccrise: '../data/test_data/umccrised/test_subject__test_sample_WGS'
  clinical_info: '../data/test_data/test_clinical_data.xlsx'
  clinical_id: 'test_subject'
  subject_id: ''
  sample_source: "-"
  dataset_name_incl: ''
  project: "-"
  top_genes: 5
  save_tables: TRUE
  hide_code_btn: TRUE
  pcgr_tier: 4
  pcgr_splice_vars: TRUE
  cn_loss: 5
  cn_gain: 95
  grch_version: 38
  ensembl_version: 86
  ucsc_genome_assembly: 38
  annot_file: 'genes/tx2gene.ensembl.v95.csv'
  genes_cancer: 'genes/umccr_cancer_genes.2019-03-20.tsv'
  genes_immunogram: 'genes/Genes_immunogram.txt'
  genes_immune_markers: 'genes/Genes_immune_markers.txt'
  genes_hrd: 'genes/Genes_HRD.txt'
  oncokb_genes: 'OncoKB/CancerGenesList.txt'
  oncokb_clin_vars: 'OncoKB/allActionableVariants.txt'
  oncokb_all_vars: 'OncoKB/allAnnotatedVariants.txt'
  civic_var_summaries: 'CIViC/01-Oct-2018-VariantSummaries.tsv'
  civic_clin_evid: 'CIViC/01-Oct-2018-ClinicalEvidenceSummaries.tsv'
  cancer_biomarkers_trans: 'cancer_biomarkers_database/cancer_genes_upon_trans.tsv'
  FusionGDB: 'FusionGDB/TCGA_ChiTaRS_combined_fusion_ORF_analyzed_gencode_h19v19_fgID.txt'
---

Transcriptome summary for patient sample **`r paste0(params$sample_name, params$dataset_name_incl)`**.


```{r script_description, eval=FALSE}
##### We attempt to structure the script in the following way:
# 1. Defining functions
# 2. Loading libraries
# 3. Loading sample data and reference datasets
# Then... code chunks involving data processing
# Then... code chunks calling the processed data to produce tables / plots / data summary
# Finish with Session info in Addendum section

##### The processed data is stored in "ref_dataset.list" list variable with elements holding the following data:
# 1. ref_dataset.list[[dataset]][["combined_data"]] = combined read count data (reference datasets + sample data) ("combineDatasets" function output in the "load_ref_data" chunk)
# 2. ref_dataset.list[[dataset]][["sample_annot"]] = combined data samples annotation ("combineDatasets" function output in the "load_ref_data" chunk)
# 3. ref_dataset.list[[dataset]][["clinical_info"]] = clinical information (survival and treatment info)
# 4. ref_dataset.list[[dataset]][["combined_data_processed"]] = transformed, filtered and normalised data (see "data_transformation" and "data_normalisation" chunks)
# 5. ref_dataset.list[[dataset]][["batch_effect_corrected"]] = transformed, filtered, normalised and batch effect corrected data (see "batch_effect_correction" chunk)
# 6. ref_dataset.list[[dataset]][["pca_combined_data_processed"]] = PCA results for combined data
# 7. ref_dataset.list[[dataset]][["pca_batch_effect_corrected"]] = PCA results for batch-effect corrected data
# 8. ref_dataset.list[[dataset]][["rle_combined_data_processed"]] = RLE plot for combined data
# 9. ref_dataset.list[[dataset]][["rle_batch_effect_corrected"]] = RLE plot for batch-effect corrected data
# 10. ref_dataset.list[[dataset]][["data_to_report"]] = Fully combined and processed data to be used for reporting
# 11. ref_dataset.list[[dataset]][["gene_annot_all"]] = gene annotation for combined read count data, containing all input genes. The annotation includes "SYMBOL", "GENEBIOTYPE", "ENSEMBL", "SEQNAME", "GENESEQSTART", "GENESEQEND". "ENSEMBL" is used for rownames
# 12. ref_dataset.list[[dataset]][["gene_annot"]] = gene annotation for transformed, filtered and normalised data. The annotation includes "SYMBOL", "GENEBIOTYPE", "ENSEMBL", "SEQNAME", "GENESEQSTART", "GENESEQEND". "SYMBOL" is used for rownames
# 13. ref_dataset.list[[dataset]][["expr_mut_cn_data_all"]] = combined expression, mutation and copy-number data
# 14. ref_dataset.list[[dataset]][["expr_mut_cn_data"]] = combined expression, mutation and copy-number data limited to cancer genes that meet user-defined CN values threshold

##### Genes of interest are stored in "ref_genes.list" list variable with elements holding the following gene sets:
# 1. ref_genes.list[["genes_cancer"]] = list of cancer genes derived from UMCCR Cancer Gene list (https://github.com/vladsaveliev/NGS_Utils/blob/master/ngs_utils/reference_data/key_genes/umccr_cancer_genes.2019-03-20.tsv) and OncoKB portal (http://oncokb.org/#/cancerGenes) 
# 2. ref_genes.list[["genes_oncokb"]] = list of cancer genes derived from OncoKB portal (http://oncokb.org/#/cancerGenes) alone (although genes present on the UMCCR panel are also flagged)
# 3. ref_genes.list[["genes_immune"]] = list of immune response markers provided in the "An Immunogram for the Cancer-Immunity Cycle" paper by Karasaki et al. (2017) (https://www.ncbi.nlm.nih.gov/pubmed/28088513) and OmniSeq report (https://www.omniseq.com/) 
# 4. ref_genes.list[["genes_hrd"]] = list of hrd (homologous recombination deficiency) genes
# 5. ref_genes.list[["pcgr"]] = list and PCGR annotation of mutated genes in given patient based on PCGR report
# 6. ref_genes.list[["purple"]] = list and PURPLE annotation of copy-number (CN) altered genes in given patient based on PURPLE results
# 7. ref_genes.list[["manta"]] = list and MANTA annotation of structural variants (SVs) with affected genes in given patient based on MANTA results
# 8. ref_genes.list[["arriba"]] = list and ARRIBA annotation of gene fusion events detected in given patient based on ARRIBA results
# 9. ref_genes.list[["pizzly"]] = list and PIZZLY annotation of gene fusion events detected in given patient based on PIZZLY results
# 10. ref_genes.list[["summary"]] = summary of above-mentioned gene lists. These gene lists are also used for generating expression summary tables and plots in individual report sections
```

```{r code_display, echo = FALSE}
##### Include or exclude the "Code" button allowing to "show"/"hide" code chunks from the report
##### The choice is written into "RNAseq_report.css", the stylesheet listed under "css" in the YAML header
if ( params$hide_code_btn ) {
  
  ##### Hide the buttons. The rule is closed with "}" so that the generated stylesheet is well-formed
  writeLines(".btn { display: none; }", con = "RNAseq_report.css")
} else {
  
  ##### Write an (effectively) empty stylesheet to keep the buttons visible
  writeLines(" ", con = "RNAseq_report.css")
}
```
  
```{r chunks_timing, comment=NA, message=FALSE, warning=FALSE}
##### Reference time point. It is overwritten (via "<<-") by the hook below each time it is called with before = TRUE
NOW <- Sys.time()

##### Time chunks during knitting
##### The "timeit" hook prints the start time when called with before = TRUE, and the stop time plus the elapsed time otherwise
##### NOTE(review): the order of statements is kept on purpose - the last evaluated value is what the hook returns, and here it is never a character string
knitr::knit_hooks$set(timeit = function(before) {
  
  if (before) {
    ##### Report the start time and remember it for the elapsed time calculation
    print(paste("Start:", Sys.time()))
    NOW <<- Sys.time()
  } else {
    ##### Report the stop time and the time elapsed since "NOW" was last set
    print(paste("Stop:", Sys.time()))
    print(Sys.time() - NOW)
  }
})

##### Set the "timeit" chunk option to TRUE by default
knitr::opts_chunk$set(timeit = TRUE)
```

```{r define_functions, comment=NA, message=FALSE, warning=FALSE}
##### Define functions
##### Create 'not in' operator. It returns TRUE for each element of "x" that has no match in "table" (negation of %in%)
"%!in%" <- function(x, table) !(x %in% table)

##### Prepare object to write into a file
##### The rownames of "x" are added as the first column, which gets an empty ("") column name. The remaining column names are kept
prepare2write <- function (x) {
  
  row_labels <- rownames(x)
  column_labels <- c("", colnames(x))
  
  ##### Put the row labels in front of the data and label the columns
  labelled <- cbind(row_labels, x)
  colnames(labelled) <- column_labels
  
  return(labelled)
}

##### Combine sample expression profile with reference datasets. This function outputs a list with first element containing the merged data and second element containing merged targets info
#####   sample_name   = name used for the sample column in the merged data and for the sample row in the targets info
#####   sample_counts = two-column table for the sample (gene IDs in the first column, which is used for merging)
#####   ref_data      = list with "ext_ref" and "int_ref" elements, each being a vector with (1) the counts file, (2) the target file and (3) the dataset name
#####   report_dir    = directory in which the list of genes missing from the merged matrix is saved
#####   dataset       = not used within this function
combineDatasets <- function(sample_name, sample_counts, ref_data, report_dir, dataset) {
  
  ##### Read the target file for a reference dataset, add the "Dataset" column and use dataset-prefixed sample names as rownames
  readTargets <- function(ref_info) {
    
    target <- read.table(ref_info[2], sep="\t", as.is=TRUE, header=TRUE)
    target <- cbind(target, rep(ref_info[3], nrow(target)))
    colnames(target)[ncol(target)] <- "Dataset"
    
    ##### Add prefix to sample names
    rownames(target) <- paste(target[,"Dataset"], target[,"Sample_name"], sep = ".")
    
    ##### Drop the first column (presumably the sample names, which are now kept as rownames)
    target[, -1]
  }
  
  ##### Extract info about target files for the external and internal reference datasets
  target.comb <- rbind(readTargets(ref_data[["ext_ref"]]), readTargets(ref_data[["int_ref"]]))
  
  ##### Add sample info (the sample name is used in both remaining target columns)
  target.sample <- data.frame(sample_name, sample_name)
  names(target.sample) <- names(target.comb)
  rownames(target.sample) <- sample_name
  target.comb <- rbind( target.comb, target.sample )
  
  ##### Make syntactically valid names
  rownames(target.comb) <- make.names(rownames(target.comb))
  
  ##### Use the sample read count data as the starting point for merging with reference datasets
  datasets.comb <- sample_counts
  names(datasets.comb) <- c("", sample_name)
  
  ##### List genes present in the sample read count file
  gene_list <- as.vector(datasets.comb[,1])
  
  ##### Loop through the expression data from different datasets and merge them into one matrix
  for ( i in seq_along(ref_data) ) {
    
    dataset.counts <- as.data.frame( read.table(gzfile(ref_data[[i]][1]), header=TRUE, sep="\t", row.names=NULL) )
    
    ##### Add prefix to sample names. The dataset name is taken from the same "ref_data" element as the counts file, so the prefix
    ##### always agrees with the one used for the targets rownames, regardless of the order of elements in "ref_data"
    colnames(dataset.counts) <- paste(ref_data[[i]][3], colnames(dataset.counts), sep = ".")
    
    ##### List genes present in individual files
    gene_list <- c( gene_list, as.vector(dataset.counts[,1]) )
    
    ##### Merge the expression datasets (by the first column) and make sure that the genes order is the same. Only genes present in all datasets are kept
    datasets.comb <- merge( datasets.comb, dataset.counts, by=1, all = FALSE, sort= TRUE)
  }
  
  ##### Use gene IDs as rownames
  rownames(datasets.comb) <- datasets.comb[,1]
  datasets.comb <- datasets.comb[, -1]
  
  ##### Make syntactically valid names
  colnames(datasets.comb) <- make.names(colnames(datasets.comb))
  
  ##### Make sure that the target file contains info only about samples present in the data matrix
  target.comb <- target.comb[ rownames(target.comb) %in% colnames(datasets.comb),  ]
  
  ##### Make sure that the samples order in the data matrix is the same as in the target file 
  datasets.comb <- datasets.comb[ , rownames(target.comb) ]
  
  ##### Identify genes that were not present across all per-sample files and were omitted in the merged matrix
  gene_list <- unique(gene_list)
  gene_list.missing <- gene_list[ gene_list %!in% rownames(datasets.comb) ]
  
  ##### Write list of missing genes into a file
  if ( length(gene_list.missing) > 0 ) {
    write.table(prepare2write(gene_list.missing), file = paste0(report_dir, "/", sample_name, ".RNAseq_report.missing_genes.txt"), sep="\t", quote=FALSE, row.names=TRUE, append = FALSE )
  }
  
  return( list(datasets.comb, target.comb) )
}

##### Assign colours to different elements
##### This function outputs a list with first element containing one colour per group (in order of first appearance in "elements")
##### and second element containing the colour assigned to each individual element
getColours <- function(elements) {
  
  n.unique <- length(unique(elements))
  
  ##### Predefined selection of colours for 3 or 4 groups. Otherwise generate one rainbow colour per group,
  ##### rather than one per element, so that the colours used for the groups are spread across the whole palette
  if ( n.unique == 3 ) {
    elements.colours <- c("powderblue", "red", "gray50")
  } else if ( n.unique == 4 ) {
    elements.colours <- c("powderblue", "forestgreen", "red", "gray50")
  } else {
    elements.colours <- rainbow(n.unique)
  }
  
  ##### Keep the groups in order of their first appearance
  f.elements <- factor(elements, levels = unique(elements))
  vec.elements <- elements.colours[ seq_len(nlevels(f.elements)) ]
  
  ##### Map each element onto the colour of its group (empty input gives an empty vector and NA elements get NA)
  elements.colour <- vec.elements[ as.integer(f.elements) ]
  
  return( list(vec.elements, elements.colour) )
}

##### Calculate TPM from RPKM (from http://luisvalesilva.com/datasimple/rna-seq_units.html )
##### Each column of "x" is divided by its own sum multiplied by 1e-06, so that the values in each column add up to one million
tpm_from_rpkm <- function(x) {
  
  ##### Per-column scaling factors
  scaling <- 1e-06 * colSums(x)
  
  ##### Transpose, so that the factors are recycled along the columns of "x", and then transpose back
  scaled <- t(x) / scaling
  return(t(scaled))
}

##### Function to generate a full-resolution pdf image before generating a small image in the chunk (from https://stackoverflow.com/questions/37834053/what-is-a-simple-way-to-thumbnail-some-plots-in-r-markdown-knitr )
#####   x       = path to the plot file generated for the chunk
#####   options = chunk options. If "thumb" is set, its "width" and "height" are used for the pdf, and "code" is re-run to draw the plot
allow_thumbnails <- function(x, options) {
  if (!is.null(options$thumb)) {
    filename <- sprintf("%s.full.pdf", strsplit(basename(x), "\\.")[[1]][1])
    absolute_path <- file.path(dirname(x), filename)

    ##### Generate the full resolution pdf. The chunk code is evaluated in the environment of this function call
    ##### NOTE(review): eval(parse()) re-runs the whole chunk code, which is only meant for the code of the chunk being knitted
    plot_env <- environment()
    pdf(absolute_path, width = options$thumb$width, height = options$thumb$height)

    ##### Close the pdf device even if the chunk code fails, otherwise subsequent plots would be drawn into the pdf file
    tryCatch(
      eval(parse(text = options$code), envir = plot_env),
      finally = dev.off()
    )

    ##### Add an html link to the low resolution png
    options$fig.link <- absolute_path
  }

  knitr:::hook_plot_md_base(x, options)
}

##### Perform PCA. This function outputs a list with (1) dataframe and samples colouring info ready for plotting, (2) interactive PCA plot and (3) interactive scree plot
#####   data       = expression data, presumably with genes in rows and samples in columns (rows are treated as variables)
#####   targets    = samples annotation with "Target" and "Dataset" columns, in the same order as the columns in "data"
#####   title      = title for the plots
#####   report_dir = directory in which the "InputDataPlots" folder with the html plots is created
#####   suffix     = suffix added to the names of the html files
pca <- function(data, targets, title = "", report_dir, suffix = "" ) {

  ##### Keep only genes with variance > 0 across all samples
  rsd <- apply(data, 1, sd)
  data.subset <- data[rsd > 0, ]
  
  ##### Perform PCA
  data.subset_pca <- prcomp(t(data.subset), scale. = FALSE)
  
  ##### Get variance importance for all principal components
  pc.variance <- summary(data.subset_pca)$importance[2,]
  importance_pca <- paste0(round(100*pc.variance, 2), "%")
  names(importance_pca) <- names(pc.variance)
    
  ##### Prepare data frame (the first three principal components are required)
  data.subset_pca.df <- data.frame(targets$Target, targets$Dataset, data.subset_pca$x[,"PC1"], data.subset_pca$x[,"PC2"], data.subset_pca$x[,"PC3"])
  colnames(data.subset_pca.df) <- c("Target", "Dataset", "PC1", "PC2", "PC3")
  
  ##### Assign colours to targets and datasets
  targets.colour <- getColours(targets$Target)
  datasets.colour <- getColours(targets$Dataset)
  
  ##### Create a list with dataframe and samples colouring info
  ##### (this is done before re-ordering the "Target" levels below, so the dataframe in the list is not affected by it)
  pca.list <- list(data.subset_pca.df, importance_pca, targets.colour, datasets.colour)
  names(pca.list) <- c("pca.df", "importance_pca", "targets", "datasets")
  
  ##### Change the targets levels order to the order of their appearance, to match the order of colours
  data.subset_pca.df$Target <- factor(data.subset_pca.df$Target, levels = unique(data.subset_pca.df$Target))
  
  ##### Generate PCA 2-D plot
  pca_plot <- plot_ly(data.subset_pca.df, x = ~PC1, y = ~PC2, color = ~Target, text=paste(targets$Target, rownames(data.subset_pca.df), sep=": "), colors = targets.colour[[1]], type='scatter', mode = "markers", marker = list(size=10, opacity = 0.7), width = 800, height = 500) %>%
    layout(title = title, xaxis = list(title = paste0("PC1", " (", importance_pca["PC1"], ")")), yaxis = list(title = paste0("PC2", " (", importance_pca["PC2"], ")")), margin = list(l=50, r=50, b=50, t=30, pad=4), autosize = FALSE, showlegend = TRUE, legend = list(orientation = "v", y = 0.9))

  ##### Generate Scree-plot
  data.subset_scree.df <- data.frame(paste0("PC ", seq_along(importance_pca)), as.numeric(gsub("%", "", importance_pca)))
  colnames(data.subset_scree.df) <- c("PC", "Variances")

  ##### The default order will be alphabetized unless specified as below
  data.subset_scree.df$PC <- factor(data.subset_scree.df$PC, levels = data.subset_scree.df[["PC"]])
  
  scree_plot <- plot_ly(data.subset_scree.df, x = ~PC, y = ~Variances, type = 'bar', width = 800, height = 350) %>%
    layout(title = title, xaxis = list(title = ""), margin = list(l=50, r=50, b=100, t=30, pad=4), autosize = FALSE)
  
  ##### Create directory for the plots
  PCAplotDir <- paste(report_dir, "InputDataPlots", sep = "/")
  if ( !file.exists(PCAplotDir) ) {
    dir.create(PCAplotDir, recursive=TRUE)
  }
  
  ##### Save interactive plots as html files ("saveWidgetFix" is defined outside this function)
  saveWidgetFix(pca_plot, file = paste0(PCAplotDir, "/pca_plot", suffix, ".html"))
  saveWidgetFix(scree_plot, file = paste0(PCAplotDir, "/scree_plot", suffix, ".html"))
  
  ##### The code that used to follow this line (workspace clean-up and closing of the graphics device) was never reached and has been removed
  return( list(pca.list, pca_plot, scree_plot) )
}

##### Convert a vector of numbers into corresponding vector of their percentiles
##### The ranks are truncated to whole numbers (which affects the averaged ranks of ties) and expressed as percentage of the vector length
perc.rank <- function(x) {
  whole_ranks <- trunc(rank(x))
  whole_ranks * 100 / length(x)
}

##### Perform range standardization between 0 and 1 (for the cumulative sums)
##### The minimum is subtracted from each value and the result is divided by the range (max - min). The names of "x" are kept
standardization <- function(x) {
  shifted <- c(x - min(x))
  shifted / (max(x) - min(x))
}

##### Calculate cumulative sums while keeping the original data order
##### The values are range-standardized, summed up cumulatively in increasing order, and the sums are range-standardized again.
##### Each element of the output corresponds to the element of "x" at the same position (the names of "x", if any, are kept)
cumsum_ordered <- function(x) {
  
  ##### Perform range standardization between 0 and 1, otherwise the negative values are summed up
  standarised <- standardization(x)
  
  ##### Get the positions of the values in increasing order
  sort_index <- order(standarised)
  
  ##### Cumsum the sorted values and put the sums back at the original positions. The order is restored by position
  ##### rather than by name, so that vectors without names or with duplicated names are handled correctly
  ordered_cumsum <- standarised
  ordered_cumsum[ sort_index ] <- cumsum(standarised[ sort_index ])
  
  ##### Perform range standardization of the cumulative sums between 0 and 1
  standarised_cumsum <- standardization(ordered_cumsum)
  
  return( standarised_cumsum )
}

##### Check for nearest value in a vector
##### This function outputs the position of the first element in "vector" with the smallest absolute difference from "x"
nearest_position <- function(vector, x) {
  
  distance <- abs(vector - x)
  
  return( which.min(distance) )
}

##### Calculate gene-wise median, sd, quantiles and cumulative fractions for expression data
#####   data    = expression matrix with genes in rows and samples in columns
#####   targets = samples annotation with "Target" column defining the group of each sample, in the same order as the columns in "data"
##### This function outputs a list with (1) per-group data frames with "median", "sd", "z", "quantile" and "cum" values for each gene,
##### and (2) per-group lists with the expression ("median"), z-score ("z"), percentile ("q") and cumulative ("cum") values for individual samples
exprGroupsStats_geneWise <- function(data, targets) {
  
  ##### Perform Z-score transformation of the expression values
  data.z <- t(apply(data, 1, scale, scale = TRUE))
  colnames(data.z) <- colnames(data)
  
  ##### Remove rows with potential NA's, which is due to SD = 0 across all samples
  data.z <- data.z[rowSums(!is.na(data.z)) > 0, , drop = FALSE]
  data <- data[ rownames(data) %in% rownames(data.z), , drop = FALSE]
  
  ##### Perform the gene-wise calculations across all groups
  ##### Convert expression values into corresponding percentiles
  data.q <- t(apply(data, 1, perc.rank))
 
  ##### Calculate cumulative sums and perform range standardization between 0 and 1
  data.cum <- t(apply(data, 1, cumsum_ordered))
 
  ##### Create lists with stats for each group and gene
  targets.list <- unique(targets$Target)
  group_stats.list <- vector("list", length(targets.list))
  names(group_stats.list) <- targets.list
  
  #### For each group...
  for ( group in targets.list ) {
    
    ##### Samples belonging to the group and their number
    group.samples <- colnames(data)[ targets$Target %in% group ]
    n.samples <- sum(c(targets$Target %in% group), na.rm = TRUE)
    
    ##### For groups with > 1 sample get the median values for each gene
    if ( n.samples > 1 && nrow(data) > 1 )  {
      
      ##### Extract the median Z-score of the group for each gene
      z.median <- rowMedians(data.z[ , group.samples ])
      
      ##### Extract the cumulative fraction corresponding to the median Z-score
      ##### For each gene, get the position of its Z-score nearest to its own median Z-score, and then extract the cumulative value
      ##### of that gene at this position (the positions refer to the columns and must not be used as linear indices of the matrix)
      cum.at.median <- vapply(seq_len(nrow(data.z)), function(i) {
        pos <- nearest_position(data.z[ i, ], z.median[i])
        if ( length(pos) == 0 ) NA_real_ else unname(data.cum[ i, pos ])
      }, numeric(1))
      
      ##### Combine the median expression, expression sd, median Z-scores, median percentiles and cumulative fractions
      stats <- cbind(rowMedians(data[ , group.samples ]), rowSds(data[ , group.samples ]), z.median, rowMedians(data.q[ , group.samples ]), cum.at.median)
      
    ##### The same for data with one gene only
    } else if ( n.samples > 1 && nrow(data) == 1 ) {
      
      ##### Extract the median Z-score
      z.median <- median(data.z[ , group.samples ])
      
      ##### Extract the cumulative fraction corresponding to the median Z-score
      ##### First, need to get the position of the Z-score nearest to the median Z-score, and then extract the cumulative value at this position
      data.z.median_pos <- nearest_position( data.z, z.median )
      
      ##### Combine the median expression, expression sd, median Z-score, median percentile and cumulative fraction
      stats <- cbind(median(data[ , group.samples ]), sd(data[ , group.samples ]), z.median, median(data.q[ , group.samples ]), data.cum[ data.z.median_pos ])
      
    ##### For the remaining groups use the values as they are (no sd is available)
    } else {
      
      stats <- cbind(data[ , group.samples ], rep( NA, nrow(data)), data.z[ , group.samples ], data.q[ , group.samples ], data.cum[ , group.samples ])
    }
    
    stats <- as.data.frame(stats)
    names(stats) <- c("median", "sd", "z", "quantile", "cum")
    rownames(stats) <- rownames(data)
    group_stats.list[[group]] <- stats
  }
  
  ##### Finally, extract the values for each gene and sample within individual groups
  gene_stats.list <- vector("list", length(targets.list))
  names(gene_stats.list) <- targets.list
  
  #### For each group...
  for ( group in targets.list ) {
    
    ##### Extract per-gene expression values
    gene_stats.list[[group]]$median <- data[ , colnames(data)[ targets$Target %in% group ], drop = FALSE ]
    
    ##### Extract per-gene z-score values
    gene_stats.list[[group]]$z <- data.z[ , colnames(data.z)[ targets$Target %in% group ], drop = FALSE ]
    
    ##### Extract per-gene percentile values
    gene_stats.list[[group]]$q <- data.q[ , colnames(data.q)[ targets$Target %in% group ], drop = FALSE ]
    
    ##### Extract per-gene cumulative values
    gene_stats.list[[group]]$cum <- data.cum[ , colnames(data.cum)[ targets$Target %in% group ], drop = FALSE ]
  }
  
  return( list( group_stats.list, gene_stats.list) )
}

##### Calculate group-wise median, sd, quantiles and cumulative fractions for expression data from specific sample group
#####   data    = expression matrix with genes in rows and samples in columns
#####   targets = samples annotation with "Target" column, in the same order as the columns in "data"
#####   target  = group (value in the "Target" column) for which the stats are calculated
##### This function outputs a data frame with "median", "sd", "z", "quantile" and "cum" columns, with genes ordered by increasing median
exprGroupStats_groupWise <- function(data, targets, target) {
  
  ##### Subset data for defined biological group
  group_data <- data[, targets$Target %in% target ]
  
  if ( is.null(ncol(group_data)) )  {
    
    ##### The subset is a vector (no columns), so use its values directly. No sd is available
    group_median <- sort(group_data)
    group_sd <- rep( NA, length(group_data))
    
  } else {
    
    ##### For groups with > 1 sample get the median and standard deviation for each gene
    group_median <- rowMedians(group_data)
    names(group_median) <- rownames(group_data)
    group_median <- sort(group_median)
    group_sd <- rowSds(group_data)
  }
  
  ##### Make sure the median and sd vectors have the same gene order
  names(group_sd) <- rownames(group_data)
  group_sd <- group_sd[ names(group_median) ]

  ##### Convert the (sorted) median expression values into corresponding percentiles
  group_q <- perc.rank(group_median)
  
  ##### Calculate cumulative sums of the range-standardized medians and perform range standardization between 0 and 1
  group_s <- sort(standardization(group_median))
  group_cum <- standardization(cumsum(group_s))
  
  ##### Centre the median expression values (with scale = FALSE the values are not divided by their standard deviation)
  group_z <- scale(group_median, scale = FALSE)
  
  ##### Organise the data into data frame
  group_df <- as.data.frame(cbind( group_median, group_sd, group_z, group_q, group_cum))
  names(group_df) <- c("median", "sd", "z", "quantile", "cum")
  
  return( group_df )
}

##### Generate cumulative distribution function (CDF) plot for selected gene. If option "addBoxPlot" = TRUE, then generate additional boxplot below to show the data variance for selected gene in individual groups
#####
##### Arguments:
#####   gene          gene identifier used to select the row of "data" and as the plot title
#####   data          expression matrix with genes in rows and samples in columns (columns are selected with rownames of "targets")
#####   targets       samples annotation with "Target" column holding the group of each sample (presumably the patient sample forms its own group named "sampleName" - confirm against the caller)
#####   sampleName    name of the patient sample group
#####   int_cancer    name of the internal reference cohort group
#####   ext_cancer    name of the external reference cohort group
#####   comp_cancer   name of the cohort the patient is compared to. The internal cohort is kept and plotted only if this equals "int_cancer"
#####   add_cancer    optional name of an additional cohort (used for group-wise stats and as the first group in the box-plot)
#####   addBoxPlot    if TRUE, a horizontal box-plot for the selected gene is added below the CDF plot
#####   scaling       "gene-wise" uses exprGroupsStats_geneWise, any other value uses exprGroupStats_groupWise
#####   report_dir    not used by this function
#####
##### Returns a plotly object (the CDF plot alone, or the CDF plot combined with the box-plot)
cdfPlot <- function(gene, data, targets, sampleName, int_cancer, ext_cancer, comp_cancer, add_cancer = NULL, addBoxPlot = FALSE, scaling = "gene-wise", report_dir) {
  
  ##### The internal reference cohort is presented only if it is the cohort the patient is compared to
  show.int <- comp_cancer == int_cancer
  
  ##### Remove the internal reference cohort data if the patient samples origins from other tissue. Of note, the internal reference cohort was only used to process the in-house data (including the investigated patient sample) and to correct batch-effects
  if ( !show.int ) {
    targets <- targets[ targets$Target %!in% int_cancer, ]
    data <- data[ ,rownames(targets) ]
  }
  
  ##### Initiate lists with stats for each group
  targets.list <- unique(targets$Target)
  group.z <- vector("list", length(targets.list))
  names(group.z) <- targets.list
  
  ##### .... and for selected gene
  group.z.gene <- vector("list", length(targets.list))
  names(group.z.gene) <- targets.list
  
  ##### Get expression-related stats for each group
  ##### ... from gene-wise approach
  if ( scaling == "gene-wise" ) {
    
    ##### Compute the stats once and reuse both elements of the output (assumes exprGroupsStats_geneWise is deterministic)
    gene.data <- data[ gene, , drop = FALSE]
    stats.geneWise <- exprGroupsStats_geneWise(gene.data, targets)
    
    ##### Get stats for each group
    group.z.gene <- stats.geneWise[[1]]
    
    ##### ... and for each sample in individual groups
    gene.stats <- stats.geneWise[[2]]
    
    for ( group in targets.list ) {
      group.z[[ group ]] <- cbind(t(gene.stats[[ group ]]$median), t(gene.stats[[ group ]]$z), t(gene.stats[[ group ]]$q), t(gene.stats[[ group ]]$cum) )
      group.z[[ group ]] <- as.data.frame(group.z[[ group ]])
      colnames(group.z[[ group ]]) <- c("median", "z", "quantile", "cum")
    }
    
    ##### The per-sample stats from all groups are stacked and used as the patient's distribution curve
    group.z[[ sampleName ]] <- do.call("rbind", group.z)
    
  ##### ... or from group-wise approach
  } else {
    
    ##### Patient and external cohort are always processed; internal and additional cohorts only if requested
    groups.needed <- c(sampleName, ext_cancer, if ( show.int ) int_cancer, add_cancer)
    
    for ( group in groups.needed ) {
      group.z[[ group ]] <- exprGroupStats_groupWise(data, targets, group)
      
      ##### Extract expression for selected genes
      group.z.gene[[ group ]] <- group.z[[ group ]][ rownames(group.z[[ group ]]) %in% gene, ]
    }
  }
  
  ##### Generate box-plot for selected gene
  if ( addBoxPlot ) {
    ##### Perform Z-score transformation of the median expression values
    if ( scaling == "gene-wise" ) {
      data.z <- t(scale(t(data)))
    } else {
      data.z <- scale(data, scale = FALSE)
    }
    
    targets$Target[ targets$Target==sampleName ] <- "Patient"
    gene.expr.df <- data.frame(targets$Target, data.z[gene, ])
    colnames(gene.expr.df) <- c("Group", "Expression")
    
    ##### Reorder groups
    if ( !is.null(add_cancer) ) {
      gene.expr.df$Group <- factor(gene.expr.df$Group, levels=c( add_cancer, ext_cancer, int_cancer, "Patient"))
      group.colours <- c("forestgreen", "cornflowerblue", "red", "black")
    } else {
      gene.expr.df$Group <- factor(gene.expr.df$Group, levels=c(ext_cancer, int_cancer, "Patient"))
      group.colours <- c("cornflowerblue", "red", "black")
    }
    
    p2 <- plot_ly(gene.expr.df, x= ~Expression, color = ~Group, type = 'box', jitter = 0.3, pointpos = 0, boxpoints = 'all', colors = group.colours, opacity = 0.5, orientation = 'h', width = 800, height = 400, showlegend=FALSE)
  }
  
  ##### Shortcuts for the patient's data, which define the main curve, the quartile lines and the x-axis range
  patient.z <- group.z[[ sampleName ]]
  patient.gene <- group.z.gene[[ sampleName ]]
  x.range <- c(min(patient.z$z)-1.5, max(patient.z$z)+1.5)
  z.quartiles <- quantile(patient.z$z)
  
  ##### Vertical extent of the quartile lines
  ##### NOTE(review): the lines span 0-100 when the internal cohort is plotted and 0-1 otherwise. This is kept as found; one of the two probably does not match the scale returned by perc.rank - confirm
  if ( show.int ) {
    perc.axis <- seq(0,100,10)
  } else {
    perc.axis <- seq(0,1,0.1)
  }
  
  ##### Helper adding the marker (selected gene) and the dashed distribution line for a reference cohort
  addCohort <- function(p, cohort, colour) {
    p %>%
      add_markers(y = group.z.gene[[ cohort ]]$quantile, x = group.z.gene[[ cohort ]]$z,
                  text = rownames( group.z.gene[[ cohort ]] ),
                  name = cohort,
                  marker = list(size = 12, opacity = 0.5, color = colour),
                  showlegend = TRUE) %>%
      
      add_lines(y = group.z[[ cohort ]]$quantile, x = group.z[[ cohort ]]$z, opacity = 0.5,
                line = list(color = colour, dash = "dash"),
                text = rownames( group.z[[ cohort ]] ),
                name = cohort, showlegend = FALSE)
  }
  
  ##### Generate interactive CDF plot with plotly
  p1 <- plot_ly(patient.z, x = ~z, color = I("black"), width = 700, height = 200) %>%
    
    ##### Add sample data
    add_markers(y = patient.gene$quantile, x = patient.gene$z,
                text = rownames( patient.gene ),
                name = "Patient",
                marker = list(size = 12, color = "black"),
                showlegend = TRUE) %>%
    
    add_lines(y = patient.z$quantile, x = patient.z$z,
              line = list(color = "grey"),
              text = rownames( patient.z ),
              name = "Patient", showlegend = FALSE)
  
  ##### Add int_cancer data (only if the internal reference cohort is to be included in the plot)
  if ( show.int ) {
    p1 <- addCohort(p1, int_cancer, "red")
  }
  
  ##### Add ext_cancer data
  p1 <- addCohort(p1, ext_cancer, "cornflowerblue")
  
  ##### Add quantile lines (Q1, Q2 and Q3 of the patient's Z-scores)
  p1 <- p1 %>%
    add_lines(y = perc.axis, x = rep(z.quartiles[2], 11), opacity = 0.5,
              line = list(color = "gray", dash = "dash"),
              name = "Q1", showlegend = FALSE) %>%
    
    add_lines(y = perc.axis, x = rep(z.quartiles[3], 11), opacity = 0.5,
              line = list(color = "gray", dash = "dash"),
              name = "Q2", showlegend = FALSE) %>%
    
    add_lines(y = perc.axis, x = rep(z.quartiles[4], 11), opacity = 0.5,
              line = list(color = "gray", dash = "dash"),
              name = "Q3", showlegend = FALSE) %>%
    
    layout(title = gene, xaxis = list(title = "mRNA expression (Z-score)", zeroline = FALSE, range = x.range),
           yaxis = list(title = "Percentile"),
           legend = list(orientation = 'v', x = 0.02, y = 1, bgcolor = "white")
    )
  
  ##### Return the CDF plot on its own unless the box-plot was requested
  if ( !addBoxPlot ) {
    return( p1 )
  }
  
  ##### Combine CDF plot with boxplot
  subplot(p1, p2, nrows = 2, shareX = TRUE, shareY = FALSE, titleY = TRUE, heights = c(0.7, 0.3)) %>%
    layout(xaxis = list(title = "mRNA expression (Z-score)", zeroline = FALSE, range = x.range),
           yaxis = list(title = "Percentile"),
           legend = list(orientation = 'v', x = 0.02, y = 1, bgcolor = "white"),
           yaxis2 = list( title =""), xaxis2 = list(title = paste0(gene, " mRNA expression (Z-score)")), margin = list(l=50, r=50, b=50, t=50, pad=4), autosize = FALSE,
           showlegend=TRUE, showlegend2=FALSE)
}

##### Convert density to counts: the density values are rescaled so that they sum up to the number of values provided
density2freq <- function(density) {
  n.points <- length(density)
  density * (n.points / sum(density))
}

##### Generate density and expression distribution plots for selected gene, highlighting samples of interest
#####
##### Arguments:
#####   gene            gene identifier used to select the row of the scaled expression matrix
#####   data            expression matrix with genes in rows and samples in columns
#####   main_title      plot title
#####   x_title         x-axis title
#####   sampleName      name(s) of the sample(s) to be highlighted (matched against column names of "data")
#####   distributions   optional character vector with theoretical distributions to draw alongside the observed one. Recognised values (case-insensitive): "normal", "binomial", "bimodal"
#####   scaling         "gene-wise" scales each gene across samples, any other value centres each sample
#####
##### Returns a plotly object
densityPlot <- function(gene, data, main_title, x_title, sampleName, distributions = NULL, scaling = "gene-wise") {
  
  if ( scaling == "gene-wise" ) {
    data.z <- t(scale(t(data)))
  } else {
    data.z <- scale(data, scale = FALSE)
  }
  
  ##### Used data for user-defined genes
  data.z <- data.z[ gene, ,drop=FALSE]
  
  ##### Sample names and expression values ordered by increasing expression (reused by all distributions below)
  n.samples <- ncol(data.z)
  samples.sorted <- colnames(data.z)[order(data.z)]
  expr.sorted <- sort(data.z)
  
  ##### Create data frame and fill it with expression and density values for each sample for selected gene
  data.df <- data.frame(gene = "Observed distribution", sample = samples.sorted, expr = expr.sorted, dens = density2freq(density(data.z, n=n.samples)$y))
  
  ##### Generate values to generate various distributions
  if ( !is.null(distributions) ) {
    
    dist.requested <- tolower(distributions)
    
    ##### Get min and max values based on the expression data
    expr.min <- min(expr.sorted)
    expr.max <- max(expr.sorted)
    data.x <- seq(expr.min, expr.max, length.out = n.samples)
    
    ##### Create empty data frame
    data.df.dist <- data.frame(matrix(ncol = 4, nrow = 0))
    colnames(data.df.dist) <- c("gene", "sample", "expr", "dens")
    
    ##### Generate y-values to mirror distributions of interest
    ##### Generate y-values for normal distribution. Useful resource https://stats.idre.ucla.edu/r/modules/probabilities-and-distributions/
    if ( "normal" %in% dist.requested ) {
      data.y <- dnorm(data.x, mean = mean(data.x), sd = (max(data.x)-mean(data.x))/5)
      data.df.dist <- rbind(data.df.dist, data.frame(gene="Normal distribution", sample = samples.sorted, expr=data.x, dens=density2freq(data.y)))
    }
    
    ##### Generate x- and y-values for binomial distribution. Useful link https://stat.ethz.ch/R-manual/R-devel/library/stats/html/Binomial.html
    if ( "binomial" %in% dist.requested ) {
      for ( prob in c(0.25, 0.75) ) {
        data.x <- 1:n.samples
        data.y <- dbinom(data.x, n.samples, prob)
        
        ##### Map the number of successes onto the observed expression range
        data.x <- rescale(data.x, to = c(expr.min, expr.max))
        data.df.dist <- rbind(data.df.dist, data.frame(gene=paste0("Binomial distribution (p=", prob, ")"), sample = samples.sorted, expr=data.x, dens=density2freq(data.y)))
      }
    }
    
    ##### Draw n/2 samples from a normal distributions with one median and another n/2 samples from a second normal distribution with a different median. Useful link https://stats.stackexchange.com/questions/355344/simulating-a-bimodal-distribution-in-the-range-of-15-in-r
    if ( "bimodal" %in% dist.requested ) {
      data.x1 <- seq(expr.min, median(expr.sorted), length.out = n.samples/2)
      data.x2 <- seq(median(expr.sorted), expr.max, length.out = n.samples/2)
      
      ##### Combine both normal distributions to generate a bimodal distribution. Make sure the the length of this vector is equal to the number samples in the data
      data.x <- c(data.x1, data.x2)
      data.x <- data.x[1:n.samples]
      
      ##### Generate y-values for bimodal distribution
      data.y <- c(dnorm(data.x1, mean = mean(data.x1), sd = (max(data.x1)-mean(data.x1))/3), dnorm(data.x2, mean = mean(data.x2), sd = (max(data.x2)-mean(data.x2))/3))
      data.y <- data.y[1:n.samples]
      
      ##### Add bimodal dist values to the distribution dataframe
      data.df.dist <- rbind(data.df.dist, data.frame(gene = "Bimodal distribution", sample = samples.sorted, expr = data.x, dens = density2freq(data.y)))
    }
    
    data.df <- rbind(data.df, data.df.dist)
  }
  
  ##### Extract expression for selected sample. This is done regardless of whether additional distributions were requested, since the markers below always need it
  data.df.selected <- data.df[ sampleName == data.df$sample, ]
  
  ##### Get min and max values based on the expression data
  den.x <- sort(data.df$expr)
  den.y <- sort(data.df$dens)
  
  ##### Assign colours to distributions
  genes.colour <- getColours(rev(unique(data.df$gene)))
  
  ##### Generate interactive density plot
  p <- plot_ly(data.df, x = ~expr, y = ~dens, type = 'scatter', mode = 'lines', color = ~gene, colors = genes.colour[[1]], width = 750, height = 200) %>%
    add_markers(y = data.df.selected$dens, x = data.df.selected$expr, 
                name = "Patient",
                text = "Patient",
                mode = 'markers',
                marker = list(size = 8, colors = data.df.selected$sample, color = rep(I("black"), each = nrow(data.df.selected)), line = list(color = "grey", width = 2)),
                showlegend = TRUE,
                inherit = FALSE) %>%
    layout(title = main_title,
           xaxis = list(title = x_title, range = c(den.x[1],den.x[length(den.x)])),
           yaxis = list (title = 'Weight', range = c(den.y[1],den.y[length(den.y)]), side = "right"),
           legend = list(orientation = 'h', y = 1.3))
  
  p
}

##### Generate bar-plot for selected gene (one bar per sample), with bars coloured by group and the sample of interest labelled as "Patient"
#####
##### Arguments:
#####   gene          gene identifier used to select the row of "data"
#####   data          matrix with genes in rows and samples in columns (samples are expected in the same order as rows of "targets")
#####   targets       samples annotation with "Target" column holding the group of each sample and sample names as rownames
#####   y_title       y-axis title
#####   sampleName    name of the patient sample; its group and rowname are relabelled to "Patient"
#####   ext_cancer, int_cancer, add_cancer   names of the external, internal and (optional) additional cohorts, used to order and colour the groups
#####   comp_cancer   not used by this function
#####
##### NOTE: the defaults of "ext_cancer", "int_cancer" and "comp_cancer" refer to variables (ext_cancer_group, int_cancer_group, comp_cancer_group) that must be defined outside of this function
#####
##### Returns a plotly object
barPlot <- function(gene, data, targets, y_title = "Counts", sampleName,  ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = NULL ) {
  
  ##### Used data for user-defined genes
  data <- data[ gene, ,drop=FALSE]
  
  ##### Prepare data frame
  targets$Target[ targets$Target==sampleName ] <- "Patient"
  rownames(targets)[ rownames(targets)==sampleName ] <- "Patient"
  data.df <- data.frame(targets$Target, rownames(targets), as.numeric(data))
  colnames(data.df) <- c("Group","Sample", "Data")
  
  ##### Reorder groups and add colours
  if ( !is.null(add_cancer) ) {
    data.df$Group <- factor(data.df$Group, levels=c( add_cancer, ext_cancer, int_cancer, "Patient"))
    group.colours <- c("forestgreen", "cornflowerblue", "red", "black")
  } else {
    data.df$Group <- factor(data.df$Group, levels=c(ext_cancer, int_cancer, "Patient"))
    group.colours <- c("cornflowerblue", "red", "black")
  }
  
  ##### The default order will be alphabetized unless specified as below
  data.df$Sample <- factor(data.df$Sample, levels = data.df[["Sample"]])
  
  p <- plot_ly(data.df, x = ~Sample, y = ~Data, color = ~Group, type = 'bar', colors = group.colours, width = 750, height = 200) %>%
    layout(title = "", xaxis = list( title = "", showticklabels = FALSE), yaxis = list(title = y_title), autosize = FALSE, legend = list(orientation = 'h', y = 1.2), showlegend=TRUE)
  
  p
}

##### Generate boxplot presenting expression profiles for selected set of genes and save it as html file in [report_dir]/glanceExprPlots
#####
##### Arguments:
#####   genes         genes to present. Genes absent in the expression matrix are silently skipped
#####   data          expression matrix with genes in rows and samples in columns (columns are selected with rownames of "targets")
#####   targets       samples annotation with "Target" column holding the group of each sample
#####   sampleName    name of the patient sample group, relabelled to "Patient" in the plot
#####   int_cancer    name of the internal reference cohort group (removed from the data if different from "comp_cancer")
#####   ext_cancer    name of the external reference cohort group
#####   comp_cancer   name of the cohort the patient is compared to when sorting genes
#####   add_cancer    optional name of an additional cohort
#####   hexcode       prefix of the output html file name
#####   type          "z" (default) presents scaled values, "perc" converts them into percentiles with perc.rank
#####   sort          "diff" orders genes by the difference between the "comp_cancer" median and the patient, "alphabetically" orders them by name, any other value keeps the input order
#####   scaling       "gene-wise" scales each gene across samples, any other value centres each sample
#####   report_dir    directory in which the "glanceExprPlots" folder is created
#####
##### Returns a plotly object
glanceExprPlot <- function(genes, data, targets, sampleName, int_cancer, ext_cancer, comp_cancer, add_cancer = NULL, hexcode, type = "z", sort = "diff", scaling = "gene-wise", report_dir) {
  
  ##### Remove the internal reference cohort if the patient is compared to other cohort
  if ( comp_cancer != int_cancer ) {
    targets <- targets[ targets$Target %!in% int_cancer, ]
    data <- data[ ,rownames(targets) ]
  }
  
  ##### Perform Z-score transformation of the expression values
  if ( scaling == "gene-wise" ) {
    data.z <- t(scale(t(data)))
  } else {
    data.z <- scale(data, scale = FALSE)
  }
  
  ##### The y-axis title is defined for both scaling approaches (it used to be set for the gene-wise approach only, making the group-wise approach fail)
  y_title <- "mRNA expression (Z-score)"
  
  if ( type == "perc" ) {
    ##### Convert a expression values into corresponding percentiles
    data.z <- t(apply(data.z, 1, perc.rank))
    y_title <- "mRNA expression (percentile)"
  }
  
  targets$Target[ targets$Target==sampleName ] <- "Patient"
  
  ##### Make sure that all genes are present in the expression matrix
  genes <- genes[ genes %in% rownames(data.z) ]
  
  if ( length(genes) == 0 ) {
    stop("None of the selected genes is present in the expression matrix", call. = FALSE)
  }
  
  ##### Genes sorting for visualisation
  ##### Sort genes by the greatest difference between the patient and the "comp_cancer" cohort
  if ( sort == "diff" ) {
    ##### drop = FALSE keeps the matrix structure required by rowMedians when only one gene or one sample is selected
    comp_cancer.medians <- rowMedians( data.z[ genes ,targets$Target==comp_cancer, drop = FALSE ] )
    names(comp_cancer.medians) <- genes
    comp_cancer.medians.diff <- comp_cancer.medians - data.z[ genes ,targets$Target=="Patient" ]
    genes <- genes[ order(comp_cancer.medians.diff) ]
    
  ##### Sort genes alphabetically
  } else if (sort == "alphabetically") {
    genes <- genes[ order(genes) ]
  }
  
  ##### Prepare dataframe for plotly (one block of rows per gene, stacked in the order of sorted genes)
  gene.expr.list <- lapply(genes, function(gene) {
    data.frame(gene, targets$Target, data.z[gene, ])
  })
  gene.expr.df <- do.call(rbind, gene.expr.list)
  colnames(gene.expr.df) <- c("Gene", "Group", "Expression")
  
  ##### Reorder groups
  if ( !is.null(add_cancer) ) {
    gene.expr.df$Group <- factor(gene.expr.df$Group, levels=c("Patient", int_cancer, ext_cancer, add_cancer))
    group.colours <- c(I("black"), "red", "cornflowerblue", "forestgreen")
    
  } else {
    gene.expr.df$Group <- factor(gene.expr.df$Group, levels=c("Patient", int_cancer, ext_cancer))
    group.colours <- c(I("black"), "red", "cornflowerblue")
  }
  
  ##### Box-plots for all groups with the patient's values overlaid as markers
  p <- plot_ly( gene.expr.df, x = ~Gene, y = ~Expression, color = ~Group, type = "box", colors = group.colours, opacity=0.3, showlegend = TRUE, width = 800, height = 400 ) %>% 
    add_markers(x = ~Gene[ gene.expr.df$Group %in% "Patient" ], y = ~Expression[ gene.expr.df$Group %in% "Patient" ], color = ~Group[ gene.expr.df$Group %in% "Patient" ], marker = list(size = 7), opacity=1, showlegend = FALSE) %>%
    
    layout(boxmode = "group", xaxis = list(title = ""), yaxis = list(title = y_title), legend = list( orientation = 'h', y = max(gene.expr.df$Expression), yanchor = "top", bgcolor = "white"))
  
  ##### Create directory for "at glance" plots
  PlotsDir <- file.path(report_dir, "glanceExprPlots")
  
  if ( !file.exists(PlotsDir) ) {
    dir.create(PlotsDir, recursive=TRUE)
  }
  
  ##### Save interactive plot as html file
  saveWidgetFix(p, file = file.path(PlotsDir, paste0(hexcode, "_glance_expr_plot.", type, ".html")))
  
  p
}

##### Generate scatterplot with per-gene expression values (y-axis), CN values (x-axis) and mutation status info (colours), if provided. The plot is saved as html file in [report_dir]/cn_expr_plot
#####
##### Arguments:
#####   data          data frame with "Gene" and "CN" columns and the expression column ("Z_score_diff" for type = "z", "Perc_diff" for type = "perc"). "Alterations" column is also required if alt_data = TRUE
#####   alt_data      if TRUE, the alterations info is added to the hover text
#####   cn_bottom     genes with CN value equal or below this threshold are annotated with their names
#####   cn_top        genes with CN value equal or above this threshold are annotated with their names
#####   comp_cancer   name of the comparison cohort, used in the y-axis title
#####   type          "z" (default) or "perc"
#####   report_dir    directory in which the "cn_expr_plot" folder is created
#####
##### NOTE(review): the defaults "cn_bottom = cn_bottom" and "cn_top = cn_top" refer to themselves, so both arguments presumably have to be provided explicitly by the caller - confirm
#####
##### Returns a plotly object
mutCNexprPlot <- function(data, alt_data = FALSE, cn_bottom = cn_bottom, cn_top = cn_top, comp_cancer, type = "z", report_dir) {
  
  ##### Flag genes with CN values beyond the thresholds. The flags are computed once and reused for gene names and annotation coordinates
  cn.extreme <- data$CN >= cn_top | data$CN <= cn_bottom
  
  ##### Extract info for genes to be annotated on the plot
  genes2annot <- data[ cn.extreme ,]$Gene
  
  if ( length(genes2annot) == 0 ) {
    genes2annot <- ""
  }
  
  ##### Rename the column with expression values to "Expr" and define the y-axis title
  if ( type == "z" ) {
    names(data)[ names(data) %in% "Z_score_diff" ] <- "Expr"
    y_title <- paste0("mRNA expression (Z-score [Patient vs ", comp_cancer, "])")
    
  } else if ( type == "perc" ) {
    names(data)[ names(data) %in% "Perc_diff" ] <- "Expr"
    y_title <- paste0("mRNA expression (percentile [Patient vs ", comp_cancer, "])")
    
  ##### Fail early with a clear message rather than with "object 'y_title' not found" further down
  } else {
    stop("Unsupported \"type\" value: ", type, ". Expected \"z\" or \"perc\"", call. = FALSE)
  }
  
  ##### Subset of the data to be annotated (taken after the expression column was renamed)
  data.annot <- data[ cn.extreme ,]
  
  ##### Generate scatterplot with per-gene expression values (y-axis) (difference between Patient's and [comp_cancer] data), CN values (x-axis) and mutation status info (colours)
  if ( alt_data ) {
    p <- plot_ly(type='scatter', mode = "markers", width = 800, height = 600, showlegend = FALSE) %>%
      
      add_markers(data = data, y = ~Expr, x = ~CN, 
                  name = ~Gene,
                  text = paste0("Gene: ", data$Gene,  "\nAlterations: ", data$Alterations),
                  mode = 'markers',
                  marker = list(size=10, symbol="circle"),
                  color = ~Gene,
                  showlegend = TRUE,
                  legendtitle=TRUE, 
                  inherit = FALSE) %>%
      
      add_annotations( data = data.annot, text=genes2annot,
                       x=~CN, xanchor="left",
                       y=~Expr, yanchor="top",
                       font = list(color = "Grey", size = 10),
                       legendtitle=TRUE, showarrow=FALSE ) %>%
      
      layout( xaxis = list(title = "CN value"), yaxis = list(title = y_title), margin = list(l=50, r=50, b=50, t=50, pad=4), autosize = FALSE, legend = list( orientation = 'v', x=1, y=0.97, yanchor="top"), showlegend=TRUE)
    
  ##### Generate scatterplot with per-gene expression values (y-axis) and CN values (x-axis)
  } else {
    p <- plot_ly(data, x = ~CN, y = ~Expr, text=~Gene, color = ~Gene, type='scatter', mode = "markers", marker = list(size=10, symbol="circle"), width = 800, height = 600) %>%
      
      add_annotations( data = data.annot, text=~Gene,
                       x=~CN, xanchor="left",
                       y=~Expr, yanchor="top",
                       font = list(color = "Grey",
                                   size = 10),
                       legendtitle=TRUE, showarrow=FALSE ) %>%
      
      layout( xaxis = list(title = "CN value"), yaxis = list(title =  y_title), margin = list(l=50, r=50, b=50, t=50, pad=4), autosize = FALSE, legend = list( orientation = 'v', y=0.8, yanchor="top"), showlegend=TRUE)
  }
  
  ##### Create directory for the plots
  mutCNexprPlotDir <- paste(report_dir, "cn_expr_plot", sep = "/")
  if ( !file.exists(mutCNexprPlotDir) ) {
    dir.create(mutCNexprPlotDir, recursive=TRUE)
  }
  
  ##### Save interactive plot as html file
  saveWidgetFix(p, file = paste(mutCNexprPlotDir, paste0("cn_expr_plot.", type, ".html"), sep = "/"))
  
  p
}

##### Fusion visualisation: export individual pages of the pdf file, located in the same directory as the Arriba results file, into png files (one per fusion)
#####
##### Arguments:
#####   arriba_file      path to the Arriba results file. The pdf file with fusion plots is searched for in its directory
#####   arriba_results   data frame with Arriba results, with "X.gene1", "gene2", "breakpoint1" and "breakpoint2" columns. Row i is assumed to correspond to page i of the pdf file
#####   results_dir      directory to which the png files are written (created if it does not exist)
#####
##### The function is called for its side effects (png files). It also closes the active graphics device, if any
arriba_plots <- function(arriba_file, arriba_results, results_dir) {
  
  ##### Get path to fusion visualisation pdf file
  arriba_dir <- dirname(arriba_file)
  arriba_plots.pdf <- list.files(arriba_dir, pattern="\\.pdf$", full.names = TRUE)
  
  ##### Exactly one pdf file is expected, since it is passed as a single document to the pdf rendering function
  if ( length(arriba_plots.pdf) != 1 ) {
    stop("Expected exactly one pdf file with fusion plots in ", arriba_dir, " but found ", length(arriba_plots.pdf), call. = FALSE)
  }
  
  ##### Create directory for results
  if ( !file.exists(results_dir) ) {
    dir.create(results_dir, recursive=TRUE)
  }
  
  ##### Export pdf images to png
  for ( i in seq_len(nrow(arriba_results)) ) {
    
    ##### Build the file name from the gene pair and breakpoints. Colons are replaced in the file name only, so that the results directory path is left untouched
    fusion.name <- paste0(make.names(paste(arriba_results$X.gene1[i], arriba_results$gene2[i], sep = "__")), "_", arriba_results$breakpoint1[i], "-", arriba_results$breakpoint2[i], ".png")
    arriba_plots.png <- paste0(results_dir, "/", gsub(":", ".", fusion.name, fixed = TRUE))
    
    fusion <- pdf_render_page(arriba_plots.pdf, page = i, dpi = 300, numeric = TRUE, opw = "", upw = "")
    writePNG(fusion, arriba_plots.png)
  }
  
  #### Clear plots to free up some memory
  if(!is.null(dev.list())) invisible(dev.off())
}

##### Generate table with coloured cells indicating expression values for selected genes
exprTable <- function(genes, keep_all = FALSE, data, cn_data = NULL, sv_data = NULL, cn_decrease = TRUE, targets, sampleName, int_cancer, ext_cancer, comp_cancer, add_cancer = NULL, genes_annot = NULL, oncokb_annot = NULL, cancer_genes = NULL, mut_annot = NULL, fusion_genes = NULL, ext_links = FALSE, type = "z", scaling = "gene-wise") {
  
  ##### Check which of the selected genes are not present in the expression data
  genes.absent <- genes[ genes %!in% rownames(data) ]
    
  ##### Initiate dataframe for expression median values in each group
  targets.list <- unique(targets$Target)
  group.z <- as.data.frame(matrix(NA, ncol = length(targets.list), nrow = nrow(data)))
  colnames(group.z) <- targets.list
  rownames(group.z) <- rownames(data)
    
  ##### Perform scaling gene-wise
  if ( scaling == "gene-wise" ) {
    ##### Calculate z-score for each group  
    group.stats <- exprGroupsStats_geneWise(data, targets)[[1]]
    
    ##### Make sure to include only genes for which Z-scores were calaculated  (genes with SD = 0 across all samples will give NA)
    group.z <- group.z[ rownames(group.z) %in% rownames(group.stats[[targets.list[1]]]), ]
    
    #### Present expression data as percentiles or z-score values (default)
    for ( group in targets.list ) {
      if ( type == "perc" ) {
        group.z[, group] <- round(group.stats[[ group ]]$quantile, digits=1)
      } else {
        group.z[, group] <- round(group.stats[[ group ]]$z, digits=2)
      }
    }
    
  ##### Perform scaling group-wise
  } else {
    for ( group in targets.list ) {
      
      ##### Calculate z-score for each group  
      group.stats <- exprGroupStats_groupWise(data[rownames(group.z), ], targets, group)
      group.stats <- group.stats[order(rownames(group.stats)), ]
      
      #### Present expression data as percentiles or z-score values (default)
      if ( type == "perc" ) {
        group.z[, group] <- round(group.stats$quantile, digits=1)
      } else {
        group.z[, group] <- round(group.stats$z, digits=2)
      }
    } 
  }
  
  ##### If additional cancer type is defined then remove it from the data
  if ( !is.null(add_cancer) ) {
    group.z <- group.z[ , names(group.z) %!in% add_cancer ]
    targets <- targets[ targets$Target %!in% add_cancer, ]
    targets.list <- targets.list[ targets.list %!in% add_cancer ]
  }
  
  ##### Compute Z-scores sd for each gene across groups
  group.z <- cbind(group.z, round(rowSds(as.matrix(group.z)), digits = 2))
  names(group.z)[ncol(group.z)] <- "SD"
  
  ##### Calculate Z-score differneces between investigated sample and median values in the cancer group of interest
  group.z <- cbind(group.z, round((group.z[, sampleName] - group.z[, comp_cancer]), digits = 2))
  names(group.z)[ncol(group.z)] <- "Diff"
  
  ##### Add NAs for genes that are absent in the expression matrix. In the "Patient vs [comp_cancer]" columns provide "0"s to facilitate interactive sorting the table. These will appear in blank cells in the table
  if ( length(genes.absent) > 0 ) {
    
    NAs.df <- data.frame(matrix(NA, ncol = ncol(group.z), nrow = length(genes.absent)))
    names(NAs.df) <- names(group.z)
    rownames(NAs.df) <- genes.absent
    NAs.df[ names(NAs.df) %in% "Diff" ] <- 0
    group.z <- rbind( group.z,  NAs.df)
  }
  
  ##### Change sample ID to "Patient" for better visualisation
  names(group.z)[names(group.z)==sampleName] <- "Patient"
  targets.list[targets.list==sampleName] <- "Patient"
  
  ##### Reorder groups
  group.z <- cbind(group.z[ , c(ext_cancer, int_cancer, "Patient")], group.z[, c("SD", "Diff" )])
  
  ##### Add "Gene" column to facilitate adding annotations
  group.z$Gene <- rownames(group.z)
  
  ##### Add genes annotation
  if ( !is.null(genes_annot) ) {
    ##### Remove rows with duplicated gene symbols
    if ( "SYMBOL" %in% names(genes_annot) ) {
      genes_annot <- genes_annot[!duplicated(genes_annot$SYMBOL),]  
    }
    
    ##### Merge the dataframe with groups median expression values and gene annotations
    group.z <- merge(genes_annot, group.z, by.x="SYMBOL", by.y="Gene", all = TRUE, sort = FALSE)
    names(group.z) <- gsub("SYMBOL", "Gene", names(group.z))
  }
  
  ##### Define colours for cells background for each group and the patient vs [comp_cancer] difference
  ##### Initiate dataframe for expression median values in each group
  brks.q <- as.data.frame( matrix(NA, ncol = length(targets.list), nrow = length(seq(.05, .95, .0005)) ))
  colnames(brks.q) <- targets.list
  clrs.q <- as.data.frame( matrix(NA, ncol = length(targets.list), nrow = length(seq(.05, .95, .0005))+1 ))
  colnames(clrs.q) <- targets.list
  
  for ( group in c(targets.list, "Diff") ) {
    brks.q[[group]] <- quantile(group.z[, group], probs = seq(.05, .95, .0005), na.rm = TRUE)
    
    clrs_pos.q <- round(seq(255, 150, length.out = length(brks.q[[group]])/2 + 1.5), 0) %>%
    {paste0("rgb(255,", ., ",", ., ")")}
    clrs_neg.q <- rev(round(seq(255, 150, length.out = length(brks.q[[group]])/2 - 0.5), 0)) %>%
    {paste0("rgb(", .,",", .,",", "255)")}
    clrs.q[[group]] <- c(clrs_neg.q, clrs_pos.q)
  }
  
  ##### Subset the expression data to include only the user-defined genes
  group.z <- group.z[ group.z$Gene %in% genes, ]
    
  #### Add variants information to the expression table - if exists. Note, "TIER" and "CONSEQUENCE" columns are required
  if( !is.null(mut_annot) && "TIER" %in% colnames(mut_annot) && length(genes) > 0 ) {
    mut_annot <- mut_annot[mut_annot$SYMBOL %in% genes,]
    
    #### keep only varaints that has the lowest tier value. Multiple varaints detected in same gene but with higher tier will be added to additional column "CONSEQUENCE_OTHER". Applies to the ones that may have multiple mutations and hence tiers
    ##### First, create a list of genes to store multiple variants
    mut_consequence <- vector("list", length(unique(mut_annot$SYMBOL)))
    mut_consequence  <- setNames(mut_consequence,  unique(mut_annot$SYMBOL) )
    
    ##### Record all varaints detected in individual genes
    if ( nrow(mut_annot) > 0 ) {
      for ( i in 1:nrow(mut_annot) ) {
        mut_consequence[[ mut_annot$SYMBOL[i] ]] <- unique(c( mut_consequence[[ mut_annot$SYMBOL[i] ]], mut_annot$CONSEQUENCE[i] ))
      }
      
      mut_annot$CONSEQUENCE_OTHER <- "-"
    }
    
    ##### Remove the first elements since these variant consequences will be reported as the "canonical" CONSEQUENCE
    mut_consequence <- lapply(mut_consequence, function(x) x[-1])
    
    ##### Order variant entires based on tier info, to make sure that the varaints with the lowest tier are reported first
    mut_annot <- mut_annot[ order(mut_annot$TIER), ]
    
    ##### Remove rows with duplicated gene symbols
    mut_annot <- mut_annot[!duplicated(mut_annot$SYMBOL),]  
    rownames(mut_annot) <- mut_annot$SYMBOL
    
    ##### Add other provided variants consequences for individual genes
    for ( gene in rownames(mut_annot) ) {
      if ( length(mut_consequence[[ gene ]]) > 0 ) {
        mut_annot$CONSEQUENCE_OTHER[ match(gene, mut_annot$SYMBOL)  ] <- mut_consequence[[ gene ]]
      }
    }
    
    #### merge the variants information with the dataframe
    group.z <- merge(group.z, mut_annot, by.x = "Gene", by.y = "SYMBOL", all = TRUE, sort = FALSE)
  }
  
  ##### Add CN data if provided
  if ( !is.null(cn_data) ) {
    ##### Get the position of "Diff" column
    col_idx <- grep("Diff", names(group.z), fixed = TRUE)
    
    ##### Now place the CN data after the "Diff" column
    if ( length(genes) > 0 ) {
      group.z <- add_column(group.z, round(cn_data[ group.z$Gene, "CN"], digits=2), .after = col_idx)
      colnames(group.z)[ col_idx+1 ] <- "Patient (CN)"
      cn_range <- base::range(group.z[ ,"Patient (CN)" ], na.rm = TRUE)
      
    } else {
      group.z <- add_column(group.z, "", .after = col_idx)
      colnames(group.z)[ col_idx+1 ] <- "Patient (CN)"
      cn_range <- 0
    }
  }

  ##### Add structural variants results from MANTA
  if ( !is.null(sv_data) && length(genes) > 0 ) {
    ##### NOTE: when merging per-gene exprssion data with SV data from MANTA the "gene" column is used since multiple entires are possible for one gene in MANTA output
    group.z <- merge(group.z, sv_data, by.x="Gene", by.y="Gene", all = TRUE, sort = FALSE)
  }
  
  ##### Add info about known fusion genes
  if ( !is.null(fusion_genes) && length(genes) > 0 ) {
    
    group.z$Fusion_gene <- NA
    group.z$Fusion_gene[ group.z$Gene %in% fusion_genes  ] <- "Yes"
  }
  
  ##### Add cancer gene resources info
  if ( !is.null(cancer_genes) && length(genes) > 0 ) {
    group.z <- merge(group.z, cancer_genes, by.x="Gene", by.y="row.names", all = TRUE, sort = FALSE)
  }
  
  ##### Include only queried genes
  group.z <- group.z[ group.z$Gene %in% genes, ]
  group.z$SYMBOL <- group.z$Gene
  
  ##### Add links to external gene annotation resourses
  if ( ext_links && length(genes) > 0 ) {
    
    ##### Place the external links after the "Diff" column
    ##### Get the position of "Diff" column
    col_idx <- grep("Diff", names(group.z), fixed = TRUE)
    group.z <- add_column(group.z, NA, .after = col_idx)
    names(group.z)[ col_idx+1 ] <- "ext_links"
    
    for ( gene in genes ) {
      ##### Provide link to VICC meta-knowledgebase ( https://search.cancervariants.org )
      group.z$ext_links[ group.z$Gene==gene ] <- paste0("<a href='https://search.cancervariants.org/#", gene, "' target='_blank'>VICC</a>")
      
      ##### Provide link to OncoKB
      if ( !is.null(oncokb_annot) ) {
        if ( gene %in% rownames(oncokb_annot) & oncokb_annot[gene, "OncoKB"] == "Yes" ) {
          group.z$ext_links[ group.z$Gene == gene ] <- paste( group.z$ext_links[ group.z$Gene==gene ] , paste0("<a href='http://oncokb.org/#/gene/", gene, "' target='_blank'>OncoKB</a>"), sep = ", ")
        }
      }
      
      ##### Provide link to CIViC database druggable genes ( https://civicdb.org )
      if ( gene %in% caner_genes_annot.list[["civic_clin_evid"]]$gene ) {
        group.z$ext_links[ group.z$Gene==gene ] <- paste( group.z$ext_links[ group.z$Gene==gene ] , paste0("<a href='", unique(caner_genes_annot.list[["civic_clin_evid"]][ caner_genes_annot.list[["civic_clin_evid"]]$gene == gene , "gene_civic_url"]), "' target='_blank'>CIViC</a>"), sep = ", ")
      }
    }
    
    names(group.z) <- gsub("ext_links", "External resources", names(group.z))
  }
  
  ##### Attach links to GeneCards and Ensembl (if provided). Here we assume that gene names are
  for ( gene in genes ) {
    if ( "ENSEMBL" %in% names(group.z) ) {
        if ( !is.na(group.z$ENSEMBL[ group.z$Gene==gene ]) ) {
          
          group.z$ENSEMBL[ group.z$Gene==gene ] <- paste0("<a href='http://ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=", group.z$ENSEMBL[ group.z$Gene==gene], "' target='_blank'>", group.z$ENSEMBL[ group.z$Gene == gene ], "</a>")
      }
    }
    
    group.z$Gene[ group.z$Gene==gene ] <- paste0("<a href='https://www.genecards.org/cgi-bin/carddisp.pl?gene=", gene, "' target='_blank'>", gene, "</a>")
  }

  ##### Order the data by CN values (to allow filtering based on CN information) and then by the highest absolute values for Patient vs [comp_cancer] difference (to allow filtering based on z-score differences)
  if ( !is.null(cn_data) && length(genes) > 0 ) {
    ##### Get the position of "Patient (CN)" column
    col_idx <- grep("Patient (CN)", names(group.z), fixed = TRUE)
    group.z <- group.z[ order(abs(group.z[, "Diff"]),  decreasing = TRUE), ]
    group.z <- group.z[ order(group.z[ ,col_idx ],  decreasing = cn_decrease), ]
    
  ##### Order the data by increasing TIER category (to allow filtering based on tier information) and then by the highest absolute values for "Diff" difference (to allow filtering based on z-score differences)
  } else if  ( !is.null(mut_annot) && length(genes) > 0 ) {
    group.z <- group.z[ order(abs(group.z[, "Diff"]),  decreasing = TRUE), ]
    group.z <- group.z[ order(group.z$TIER), ]
    
  ##### Order the data by MANTA increasing Tier (to prioritise SVs, based on https://github.com/AstraZeneca-NGS/simple_sv_annotation/blob/master/simple_sv_annotation.py), event type and then by the highest absolute values for Patient vs [comp_cancer] difference
  } else if  ( !is.null(sv_data) && length(genes) > 0 ) {
    group.z <- group.z[ order(abs(group.z[, "Diff"]),  decreasing = TRUE), ]
    group.z <- group.z[ order(group.z$"Fusion genes",  decreasing = TRUE), ]
    group.z <- group.z[ order(group.z$Tier), ]
    
  ##### Otherwise order table by the highest absolute values for Patient vs [comp_cancer] difference
  } else if ( length(genes) > 0 ) {
    group.z <- group.z[ order(abs(group.z[, "Diff"]),  decreasing = TRUE), ]
  }
  
  ##### Remove the internal reference cohort column if the patient samples origins from other tissue. Of note, the internal reference cohort was only used to process the in-house data (including the investigated patient sample) and to correct batch-effects
  if ( comp_cancer != int_cancer ) {
      group.z <- group.z[ , names(group.z) %!in% int_cancer ]
      targets.list[ match(int_cancer, targets.list) ] <- "Patient"
      
      ##### Get the position of "Diff" column
      diff_col_idx <- grep("Diff", names(group.z), fixed = TRUE)
      
  } else {
      ##### Get the position of "Diff" column
      diff_col_idx <- grep("Diff", names(group.z), fixed = TRUE)
      names(group.z)[ match("Diff", names(group.z)) ] <- paste0("Patient vs ", comp_cancer)
  }
  
  ##### Limit the ordered table to maximum of 2000 entries if "keep_all" is set to FALSE (default)
  if ( nrow(group.z) > 2000 && !keep_all ) {
    group.z <- group.z[ 1:2000, ]
  }
  
  ##### Define table height
  if ( nrow(group.z) == 2 ) {
    table_height <- 230
    scrollY <- "67px"
  } else {
    scrollY <- "167px"
    table_height <- 318
  }
  
  ##### Generate a table with genes annotations and coloured expression values in each group
  if ( !is.null(cn_data) ) {
    dt.table <- DT::datatable( data = group.z[, names(group.z) %!in% c("SYMBOL", "SD")], filter="none", rownames = FALSE, extensions = c('Buttons','Scroller'), options = list(pageLength = 10, dom = 'Bfrtip', buttons = c('excel', 'csv', 'pdf','copy','colvis'), scrollX = TRUE, scrollCollapse = TRUE, deferRender = TRUE, scrollY = scrollY, scroller = TRUE), width = 800, height = table_height, caption = htmltools::tags$caption( style = 'caption-side: top; text-align: left; color:grey; font-size:100% ;'), escape = FALSE) %>%
      DT::formatStyle( columns = names(group.z)[names(group.z) %!in% c("SYMBOL", "SD")], `font-size` = '12px', 'text-align' = 'center' ) %>%
      
      ##### Colour cells according to the expression values quantiles in each group
      DT::formatStyle(columns = targets.list[1], 
                      backgroundColor = DT::styleInterval(brks.q[[targets.list[1]]], clrs.q[[targets.list[1]]])) %>%
      DT::formatStyle(columns = targets.list[2], 
                      backgroundColor = DT::styleInterval(brks.q[[targets.list[2]]], clrs.q[[targets.list[2]]])) %>%
      DT::formatStyle(columns = targets.list[3], 
                      backgroundColor = DT::styleInterval(brks.q[[targets.list[3]]], clrs.q[[targets.list[3]]])) %>%
      DT::formatStyle(columns = names(group.z)[diff_col_idx], 
                      backgroundColor = DT::styleInterval(brks.q[["Diff"]], clrs.q[["Diff"]])) %>%
      DT::formatStyle(columns = "Patient (CN)", background = DT::styleColorBar(cn_range, 'lightblue'), backgroundSize = '98% 88%', backgroundRepeat = 'no-repeat', backgroundPosition = 'center')
    
  ##### Generate a table with genes annotations and coloured expression values in each group
  } else {
    dt.table <- DT::datatable( data = group.z[, names(group.z) %!in% c("SYMBOL", "SD")], filter="none", rownames = FALSE, extensions = c('Buttons','Scroller'), options = list(pageLength = 10, dom = 'Bfrtip', buttons = c('excel', 'csv', 'pdf','copy','colvis'), scrollX = TRUE, scrollCollapse = TRUE, deferRender = TRUE, scrollY = scrollY, scroller = TRUE), width = 800, height = table_height, caption = htmltools::tags$caption( style = 'caption-side: top; text-align: left; color:grey; font-size:100% ;'), escape = FALSE) %>%
      DT::formatStyle( columns = names(group.z)[names(group.z) %!in% c("SYMBOL", "SD")], `font-size` = '12px', 'text-align' = 'center' ) %>%
      
      ##### Colour cells according to the expression values quantiles in each group
      DT::formatStyle(columns = targets.list[1], 
                      backgroundColor = DT::styleInterval(brks.q[[targets.list[1]]], clrs.q[[targets.list[1]]])) %>%
      DT::formatStyle(columns = targets.list[2], 
                      backgroundColor = DT::styleInterval(brks.q[[targets.list[2]]], clrs.q[[targets.list[2]]])) %>%
      DT::formatStyle(columns = targets.list[3], 
                      backgroundColor = DT::styleInterval(brks.q[[targets.list[3]]], clrs.q[[targets.list[3]]])) %>%
      DT::formatStyle(columns = names(group.z)[diff_col_idx], 
                      backgroundColor = DT::styleInterval(brks.q[["Diff"]], clrs.q[["Diff"]]))
  }
  
  ##### Clean the space and return output
  rm(genes, data, cn_data, sv_data, targets, sampleName, genes_annot, oncokb_annot, cancer_genes, mut_annot, fusion_genes, genes.absent, targets.list, group.stats, brks.q, clrs.q)
  
  return( list(dt.table,  group.z) )
}

##### Generate table with drugs targeting selected set of genes using info from CIViC database (https://civicdb.org/)
# genes - vector of gene symbols to look up in the CIViC clinical evidence table
# civic_var_summaries - data frame with CIViC variant summaries; the "gene", "variant", "variant_types" and "civic_actionability_score" columns are used
# civic_clin_evid - data frame with CIViC clinical evidence; the "gene", "variant", "drugs", "nct_ids", "evidence_level", "evidence_type", "doid", "disease", "pubmed_id" and the "*_civic_url" columns are used, plus the remaining columns listed in "drug.info" below
# evid_type - only evidence entries whose "evidence_type" equals this value are reported
# var_type - optional filter applied to the variant names: "mutation" removes expression / copy-number / fusion-like variants, while "expression", "fusion", "copy_gain" and "copy_loss" keep only variants matching the corresponding keywords. NULL (default) or any other value applies no filter
# Returns a list with (1) the DT datatable widget and (2) the data frame displayed in it
civicDrugTable <- function(genes, civic_var_summaries, civic_clin_evid, evid_type = "Predictive", var_type = NULL) {
  
  ##### Initialise an empty data frame to collect the drug-target info from CIViC
  drug.info <- setNames(data.frame(matrix(ncol = 18, nrow = 0)), c("Gene", "Variant", "variant_types", "drugs", "nct_ids", "evidence_level", "evidence_type", "evidence_direction", "clinical_significance", "rating", "civic_actionability_score", "Disease", "phenotypes", "pubmed_id", "variant_origin", "representative_transcript", "representative_transcript2", "last_review_date"))
  
  evid_levels <- list("A" = "A: Validated association", "B" = "B: Clinical evidence", "C" = "C: Case study", "D" = "D: Preclinical evidence", "E" = "E: Inferential association")
  
  ##### Return the row indices of variants whose name matches any of the patterns (case-insensitive). Indices are reported in pattern order, without duplicates
  matchVariantNames <- function(variants, patterns) {
    unique(unlist(lapply(patterns, grep, x = variants, ignore.case = TRUE)))
  }
  
  ##### Loop through each gene and check if they are druggable
  for ( gene in genes ) {
    ##### Get summary info about druggable genes
    if ( gene %in% civic_clin_evid$gene ) {
      ##### Extract info about clinical evidence for all reported variants in the queried gene
      clin.evid.info <- civic_clin_evid[ civic_clin_evid$gene == gene , ]

      ##### Use more descriptive evidence level info. Levels without a description (including missing values) are left untouched
      for ( level in unique(clin.evid.info$evidence_level) ) {
        if ( level %in% names(evid_levels) ) {
          clin.evid.info$evidence_level[ clin.evid.info$evidence_level == level ] <- evid_levels[[ level ]]
        }
      }
      
      ##### Subset table to include only variants with the evidence type of interest
      clin.evid.info <- clin.evid.info[ clin.evid.info$evidence_type == evid_type,  ]
        
      if ( nrow(clin.evid.info) > 0 ) {
        ##### Provide link to CIViC clinical evidence summary (attached to the drug names)
        clin.evid.info$drugs <- paste0("<a href='", clin.evid.info$evidence_civic_url, "' target='_blank'>", clin.evid.info$drugs, "</a>")
        
        ##### Provide link to CIViC clinical evidence summary (attached to the evidence type)
        clin.evid.info$evidence_type <- paste0("<a href='", clin.evid.info$evidence_civic_url, "' target='_blank'>", clin.evid.info$evidence_type, "</a>")
        
        ##### Provide link to CIViC gene summary
        clin.evid.info$gene_civic_url <- paste0("<a href='", clin.evid.info$gene_civic_url, "' target='_blank'>", gene, "</a>")
        names(clin.evid.info)[ names(clin.evid.info) =="gene_civic_url" ] <- "Gene"
        
        ##### Provide link to CIViC variants summary
        clin.evid.info$variant_civic_url <- paste0("<a href='", clin.evid.info$variant_civic_url, "' target='_blank'>", clin.evid.info$variant, "</a>")
        names(clin.evid.info)[ names(clin.evid.info) =="variant_civic_url" ] <- "Variant"
        
        ##### Provide link to ClinicalTrials.gov based on NCT IDs
        for ( nct_id in clin.evid.info$nct_ids ) {
          if ( !is.empty(nct_id) ) {
            
            ##### Deal with multiple NCT IDs (separated by comma). "paste" inserts spaces around the IDs; the ones that would end up inside the URL are removed with "gsub"
            nct_id_url <- gsub(" '" , "'", paste(gsub("/ " , "/", paste("<a href='https://clinicaltrials.gov/ct2/show/", unlist(strsplit(nct_id, split=",", fixed=TRUE)) , "' target='_blank'>", unlist(strsplit(nct_id, split=",", fixed=TRUE)), "</a>")), collapse = ", "))
            clin.evid.info$nct_ids[ clin.evid.info$nct_ids==nct_id ] <- nct_id_url
          }
        }
        
        ##### Provide link to PubMed
        clin.evid.info$pubmed_id <- paste0("<a href='https://www.ncbi.nlm.nih.gov/pubmed/", clin.evid.info$pubmed_id, "' target='_blank'>", clin.evid.info$pubmed_id, "</a>")
        
        ##### Provide link to Disease Ontology
        clin.evid.info$doid <- paste0("<a href='http://www.disease-ontology.org/?id=DOID:", clin.evid.info$doid, "' target='_blank'>", clin.evid.info$disease, "</a>")
        names(clin.evid.info)[ names(clin.evid.info) =="doid" ] <- "Disease"
        
        ##### Extract info about all variants in that gene
        var.info <- civic_var_summaries[ civic_var_summaries$gene == gene , ]
        var.info <- var.info[, c("variant", "variant_types", "civic_actionability_score")]
        var.info[,"variant_types"] <- gsub("_", " ", var.info[,"variant_types"])
        var.info[,"variant_types"] <- gsub(",", ", ", var.info[,"variant_types"])
        
        ##### Merge the variants info with the clinical evidence info
        clin.evid.info <- merge(clin.evid.info, var.info, by = "variant", all.x = TRUE)
        
        ##### Filter drug matching info depending on the variant type
        if ( !is.null(var_type) ) {
          
          if ( var_type == "mutation" ) {
            ##### Remove entries containing the keywords below
            var_type.drop <- matchVariantNames(clin.evid.info$variant, c("EXPRESSION", "AMPLIFICATION", "DELETION", "METHYLATION", "WILD TYPE", "FUSION", "REARRANGEMENT", "PHOSPHORYLATION", "COPY", "TRANSCRIPT", "GAIN", "LOSS"))
            
            ##### Negative indexing with an empty index would drop ALL rows, so only subset when there is something to remove
            if ( length(var_type.drop) > 0 ) {
              clin.evid.info <- clin.evid.info[ -var_type.drop, ]
            }
            
          } else {
            ##### For the remaining variant types keep only entries containing the listed keywords. For fusions the "[gene]-" and "-[gene]" patterns are used as well
            var_type.patterns <- switch(var_type,
              "expression" = c("EXPRESSION", "FUSION", "TRANSCRIPT", "ALTERATION"),
              "fusion" = c("FUSION", paste0(gene, "-"), paste0("-", gene), "ALTERATION"),
              "copy_gain" = c("AMPLIFICATION", "COPY", "GAIN", "ALTERATION"),
              "copy_loss" = c("DELETION", "COPY", "LOSS", "ALTERATION")
            )
            
            ##### Unrecognised variant types yield no patterns and no filtering is applied
            if ( !is.null(var_type.patterns) ) {
              clin.evid.info <- clin.evid.info[ matchVariantNames(clin.evid.info$variant, var_type.patterns), ]
            }
          }
        }
      }
      
      if ( nrow(clin.evid.info) > 0 ) {
        ##### Subset table to include only most important info
        clin.evid.info <- clin.evid.info[ , names(drug.info)]
        
        ##### Add drugs info for subsequent gene
        drug.info <- rbind(drug.info, clin.evid.info)
      }
    }
  }
  
  ##### Use more friendly column names for the table
  names(drug.info) <- c("Gene", "Variant", "Variant type", "Drugs", "Clinical trials", "Evidence level", "Evidence type", "Evidence direction", "Clinical significance", "Trust rating", "Actionability score", "Disease", "Phenotypes", "PubMed ID",  "Variant origin", "Representative transcript", "Representative transcript 2", "Review date")
  
  ##### Limit the info to fewer columns
  drug.info <- drug.info[ , c("Gene", "Variant", "Variant type", "Drugs", "Clinical trials", "Evidence level", "Evidence direction", "Clinical significance", "Trust rating", "Actionability score", "Disease", "Phenotypes", "PubMed ID",  "Representative transcript", "Representative transcript 2")] 
  
  ##### Generate a table. "escape = FALSE" is required to render the HTML links built above
  dt.table <- DT::datatable( data = drug.info, filter = "none", rownames = FALSE, extensions = c('Buttons','Scroller'), options = list(pageLength = 10, dom = 'Bfrtip', buttons = c('excel', 'csv', 'pdf','copy','colvis'), scrollX = TRUE, deferRender = TRUE, scrollY = "167px", scroller = TRUE), width = 800, caption = htmltools::tags$caption(style = 'caption-side: top; text-align: left; color:grey; font-size:100% ;'), escape = FALSE) %>%
    DT::formatStyle( columns = names(drug.info), `font-size` = '12px', 'text-align' = 'center' ) %>%
    ##### Colour cells according to evidence level and trust rating
    DT::formatStyle(columns = "Evidence level", 
                    backgroundColor = DT::styleEqual(c("A: Validated association", "B: Clinical evidence", "C: Case study", "D: Preclinical evidence", "E: Inferential association"), c("mediumseagreen", "deepskyblue", "mediumpurple", "darkorange", "coral")) )  %>%
    DT::formatStyle(columns = "Trust rating", 
                    backgroundColor = DT::styleEqual(c(1:5), c("coral", "azure", "lightskyblue", "palegreen", "mediumseagreen")) )
  
  ##### Return output. Local objects are discarded when the function exits, so no explicit clean-up is needed
  return( list(dt.table,  drug.info) )
}

##### Code from UMCCRISE to prioritise SV events (version for "-sv-prioritize-manta-pass.tsv" files, https://github.com/umccr/umccrise/blob/master/umccrise/rmd_files/index.Rmd)
# sv_file - path to a tab-separated file with a header line; the "annotation", "chrom", "start", "end", "split_read_support" and "paired_support_PR" columns are used
# Returns a table with one row per distinct annotation ("Location", "Gene", "Priority", "Tier", "Annotation", "Event", "SR", "PR"), or NULL (with a warning) if the file holds fewer than two lines
sv_prioritize_short <- function(sv_file) {
  
  sv_all <- NULL
  
  ##### Parse the file only if there is at least one line following the header
  if ( length(readLines(con = sv_file, n = 2)) > 1 ) {
    sv_all <- readr::read_tsv(sv_file, col_names = TRUE) %>%
      ##### Unpack multiple (comma-separated) annotations per region into separate rows
      dplyr::mutate(annotation = strsplit(annotation, ',')) %>%
      tidyr::unnest(annotation) %>%
      ##### Unpack the "|"-separated annotation fields into columns
      tidyr::separate(annotation,
                      c('Event', 'Annotation', 'Gene', 'Transcript', 'Priority', 'Tier'),
                      sep = '\\|', convert = TRUE) %>%
      dplyr::mutate(start = format(start, big.mark = ',', trim = TRUE),
                    end = format(end, big.mark = ',', trim = TRUE)) %>%
      ##### The location is reported as "[chrom]:[start]"
      dplyr::mutate(Location = stringr::str_c(chrom, ':', start, sep = '')) %>%
      dplyr::mutate(SR = split_read_support, PR = paired_support_PR) %>%
      dplyr::select(Location, Gene, Priority, Tier, Annotation, Event, SR, PR) %>%
      dplyr::distinct()
  } else {
    warning('No prioritized events detected')
  }
  return( sv_all )
}

##### Code from UMCCRISE to prioritise SV events (version for "-manta.tsv" files, https://github.com/umccr/umccrise/blob/master/umccrise/rmd_files/index.Rmd)
# sv_file - path to a tab-separated file with a header line. Two column naming schemes are supported: one with "AF_BPI", "AF_PURPLE", "CN_PURPLE", "CN_change_PURPLE" and "Ploidy_PURPLE" columns, and one with "BPI_AF", "AF", "CN", "CN_change" and "Ploidy" columns
# Returns a table with one row per distinct annotation, using the same output column names for both schemes, or NULL (with a warning) if the file holds fewer than two lines
sv_prioritize <- function(sv_file) {
  
  sv_all <- NULL

  ##### Parse the file only if there is at least one line following the header
  if ( length(readLines(con = sv_file, n = 2)) > 1 ) {
    
    ##### Read the file once; the column names decide which of the two naming schemes applies
    sv_raw <- readr::read_tsv(sv_file, col_names = TRUE)
    
    ##### Due to changes in PURPLE output format there are two expected column names combinations
    if ( all(c("AF_BPI", "AF_PURPLE", "CN_PURPLE", "CN_change_PURPLE", "Ploidy_PURPLE") %in% names(sv_raw)) ) {
    
      sv_all <- sv_raw %>%
        dplyr::select(-caller, -sample) %>% 
        split_sv_field(AF_BPI, is_pct = TRUE) %>% 
        split_sv_field(AF_PURPLE, is_pct = TRUE) %>% 
        split_sv_field(CN_PURPLE) %>% 
        split_sv_field(CN_change_PURPLE) %>% 
        dplyr::mutate(
          Ploidy_PURPLE = as.double(Ploidy_PURPLE),
          Ploidy_PURPLE = format(Ploidy_PURPLE, nsmall = 2)
        ) %>% 
        sv_unpack_annotations() %>% 
        dplyr::arrange(Tier, Effect, dplyr::desc(AF_PURPLE), Genes) %>% 
        sv_summarise_genes() %>% 
        dplyr::select(Tier = tier, Event = svtype, Gene, Effect = Effect, Detail = Detail, Location = location, AF = AF_PURPLE, `CN chg` = CN_change_PURPLE, SR, PR, CN = CN_PURPLE, Ploidy = Ploidy_PURPLE, PURPLE_status, `SR (ref)`, `PR (ref)`, PE, `PE (ref)`, `Somatic score` = somaticscore, Transcript = Transcript, `Other effects`, `Other affected genes`, `AF at breakpoint 1` = AF_PURPLE1, `AF at breakpoint 2` = AF_PURPLE2, `CN at breakpoint 1` = CN_PURPLE1, `CN at breakpoint 2` = CN_PURPLE2, `CN change at breakpoint 1` = CN_change_PURPLE1, `CN change at breakpoint 2` = CN_change_PURPLE2, `AF before adjustment, bp 1` = AF_BPI1, `AF before adjustment, bp 2` = AF_BPI2
        ) %>%
        dplyr::distinct()
      
    } else {
      sv_all <- sv_raw %>%
        dplyr::select(-caller, -sample) %>% 
        split_sv_field(BPI_AF, is_pct = TRUE) %>% 
        split_sv_field(AF, is_pct = TRUE) %>% 
        split_sv_field(CN) %>% 
        split_sv_field(CN_change) %>% 
        dplyr::mutate(
          Ploidy = as.double(Ploidy),
          Ploidy = format(Ploidy, nsmall = 2)
        ) %>% 
        sv_unpack_annotations() %>% 
        dplyr::arrange(Tier, Effect, dplyr::desc(AF), Genes) %>% 
        sv_summarise_genes() %>% 
        dplyr::select(Tier = tier, Event = svtype, Gene, Effect = Effect, Detail = Detail, Location = location, AF, `CN chg` = CN_change, SR, PR, CN, Ploidy, PURPLE_status, `SR (ref)`, `PR (ref)`, PE, `PE (ref)`, `Somatic score` = somaticscore, Transcript = Transcript, `Other effects`, `Other affected genes`, `AF at breakpoint 1` = AF1, `AF at breakpoint 2` = AF2, `CN at breakpoint 1` = CN1, `CN at breakpoint 2` = CN2, `CN change at breakpoint 1` = CN_change1, `CN change at breakpoint 2` = CN_change2, `AF before adjustment, bp 1` = BPI_AF1, `AF before adjustment, bp 2` = BPI_AF2
        ) %>%
        dplyr::distinct()
    }
  } else {
    warning('No prioritized events detected')
  }
  return( sv_all )
}

##### Helper for the "sv_prioritize" function: the processing steps that do not depend on the PURPLE column naming scheme. Splits the read support fields into ref / alt counts, filters BND events, unpacks the annotations and builds the "location" column
sv_unpack_annotations <- function(sv_tbl) {
  sv_tbl %>% 
    tidyr::separate(split_read_support, c("SR (ref)", "SR (alt)"), ",") %>% 
    dplyr::mutate(SR = as.integer(`SR (alt)`)) %>% 
    tidyr::separate(paired_support_PR, c("PR (ref)", "PR (alt)"), ",") %>% 
    dplyr::mutate(PR = as.integer(`PR (alt)`)) %>% 
    tidyr::separate(paired_support_PE, c("PE (ref)", "PE (alt)"), ",") %>% 
    dplyr::mutate(PE = as.integer(`PE (alt)`)) %>% 
    
    ##### Keep non-BND events, and BND events for which SR is missing or PR is greater than SR
    dplyr::filter(svtype != 'BND' | is.na(SR) | PR>SR) %>% 
    ##### Unpack multiple (comma-separated) annotations per region into separate rows
    dplyr::mutate(annotation = strsplit(annotation, ',')) %>% 
    tidyr::unnest(annotation) %>% 
    ##### Unpack the "|"-separated annotation fields into columns
    tidyr::separate(annotation,
                    c('Event', 'Effect', 'Genes', 'Transcript', 'Detail', 'Tier'),
                    sep = '\\|', convert = TRUE) %>% 
    dplyr::mutate(start = format(start, big.mark = ',', trim = TRUE),
                  end = format(end, big.mark = ',', trim = TRUE)) %>% 
    ##### The location is reported as "[chrom]:[start]"
    dplyr::mutate(location = stringr::str_c(chrom, ':', start, sep = ''))
}

##### Helper for the "sv_prioritize" function: derives the "Gene" (first two affected genes, with "..." appended if there are more) and "Other affected genes" columns from "Genes", and splits "Effect" into the first effect and "Other effects"
sv_summarise_genes <- function(sv_tbl) {
  sv_tbl %>% 
    dplyr::mutate(Gene = subset_genes(Genes, c(1, 2)),
                  Gene = ifelse(lengths(stringr::str_split(Genes, '&')) > 2,
                                stringr::str_c(Gene, '...', sep = ', '),
                                Gene),
                  `Other affected genes` = stringr::str_replace_all(subset_genes(Genes, -c(1,2)), '&', ', '),
                  ##### Gene pairs stay joined with "&" for gene fusions, otherwise they are listed comma-separated
                  Gene = ifelse(stringr::str_detect(Effect, "gene_fusion"),
                                Gene,
                                stringr::str_replace_all(Gene, '&', ', '))
                  ) %>% 
    tidyr::separate(Effect, c("Effect", "Other effects"), sep = '&')
}

##### Function used in the "sv_prioritize" function
# genes - vector of "&"-separated gene lists (one string per event)
# ind - index vector applied to the genes of each event, e.g. c(1, 2) for the first two genes or -c(1, 2) for all but the first two
# Returns a character vector of the same length as "genes" with the selected genes joined by "&". Missing and empty gene entries are dropped, and "" is returned when nothing is left
subset_genes <- function(genes, ind) {
  genes_per_event <- strsplit(as.character(genes), "&", fixed = TRUE)
  
  vapply(genes_per_event, function(event_genes) {
    selected <- event_genes[ind]
    
    ##### Indices beyond the number of available genes yield NA; these and empty entries are not reported
    selected <- selected[ !is.na(selected) & selected != "" ]
    paste(selected, collapse = "&")
  }, character(1), USE.NAMES = FALSE)
}

##### Function used in the "sv_prioritize" function
# val - numeric vector to format
# is_pct - if TRUE, "%" is appended to each formatted value
# Returns the values formatted with "format(val, digits = 1)" (i.e. formatted jointly, so all elements share a common width and number of decimals); missing values are returned as NA
format_val <- function(val, is_pct = FALSE) {
  suffix <- if (is_pct) "%" else ""
  
  ##### "format" turns missing values into the "NA" string, hence they are reset to real NAs here
  ifelse(!is.na(val), paste0(format(val, digits = 1), suffix), NA)
}

##### Function used in the "sv_prioritize" function 
# .data - data frame holding the field to split
# field - unquoted name of a column with one or two comma-separated numeric values
# is_pct - if TRUE, the values are multiplied by 100 and reported with "%"
# Returns ".data" in which "[field]" is replaced by three formatted columns: "[field]1" and "[field]2" (the individual values) and "[field]" (their mean; if the second value is missing the first one is used instead)
split_sv_field <- function(.data, field, is_pct = FALSE) {
  f_q <- rlang::enquo(field)
  f_str <- rlang::quo_name(f_q)
  f1_str <- paste0(f_str, "1")
  f2_str <- paste0(f_str, "2")
  f1_q <- rlang::sym(f1_str)
  f2_q <- rlang::sym(f2_str)
  
  multiplier <- if (is_pct) 100 else 1
  
  .data %>% 
    ##### A single value is expected for some events and is handled below, so the second column is filled with NA without raising a warning
    tidyr::separate(!!f_q, c(f1_str, f2_str), ",", fill = "right") %>% 
    dplyr::mutate(
      !!f1_q := as.double(!!f1_q) * multiplier,
      !!f2_q := as.double(!!f2_q) * multiplier,
      !!f_q  := (!!f1_q + ifelse(is.na(!!f2_q), !!f1_q, !!f2_q)) / 2,
      !!f_q  := format_val(!!f_q, is_pct),
      !!f1_q := format_val(!!f1_q, is_pct),
      !!f2_q := format_val(!!f2_q, is_pct)
    )
}

##### Capitalise the first letter of each space-separated word
# y - character vector
# Returns a character vector of the same length as "y"; the remaining letters of each word are left unchanged
CapStr <- function(y) {
  vapply(strsplit(y, " "), function(words) {
    paste0(toupper(substring(words, 1, 1)), substring(words, 2), collapse = " ")
  }, character(1), USE.NAMES = FALSE)
}

##### A wrapper to saveWidget which compensates for arguable BUG in saveWidget which requires `file` to be in current working directory (see post https://github.com/ramnathv/htmlwidgets/issues/299 )
saveWidgetFix <- function ( widget, file, ...) {
  ##### Move into the target directory (setwd returns the directory being left) and restore the original one on exit, even if saving fails
  previous_dir <- setwd(dirname(file))
  on.exit(setwd(previous_dir))
  ##### Save under the bare file name; additional arguments are passed on to saveWidget
  htmlwidgets::saveWidget(widget, file = basename(file), ...)
}

##### Define function for generating spider web plots to present immunogram genes (code from http://www.statisticstoproveanything.com/2013/11/spider-web-plots-in-r.html)
# data - data.frame or matrix (columns are referred to by name)
# data.row - row of data to plot, given as row index or row name (if NULL uses row 1)
# y.cols - columns of interest (if NULL it selects all numeric columns)
# main - title of plot (if NULL then rowname of data)
# add - whether the plot should be added to an existing plot
# col - color of the data line
# lty - lty of the data line
# scale - whether to centre and scale each selected column and then divide it by its largest absolute value before plotting
#
# NOTE: the active graphics device (if any) is closed before the function returns

webplot = function(data, data.row = NULL, y.cols = NULL, main = NULL, add = F, 
    col = "red", lty = 1, scale = T) {
    if (!is.matrix(data) && !is.data.frame(data)) 
        stop("Requires matrix or data.frame")

    # One TRUE/FALSE per column. A matrix has a single storage type, so all of
    # its columns share the same answer (iterating over a matrix would visit
    # every cell instead of every column)
    numeric.cols = function(x) {
        if (is.data.frame(x)) {
            vapply(x, is.numeric, logical(1))
        } else {
            rep(is.numeric(x), ncol(x))
        }
    }

    if (is.null(y.cols)) 
        y.cols = colnames(data)[numeric.cols(data)]
    if (length(y.cols) == 0) 
        stop("No numeric columns to plot")

    # Validate the requested columns only and name exactly the offending ones;
    # drop = FALSE keeps a single requested column as a column
    selected = data[, y.cols, drop = FALSE]
    not.numeric = !numeric.cols(selected)
    if (any(not.numeric)) {
        out = paste0("\"", colnames(selected)[not.numeric], "\"", 
            collapse = ", ")
        stop(paste0("All y.cols must be numeric\n", out, " are not numeric"))
    }
    if (is.null(data.row)) 
        data.row = 1
    if (is.character(data.row)) {
        if (data.row %in% rownames(data)) {
            data.row = which(rownames(data) == data.row)
        } else {
            stop("Invalid value for data.row:\nMust be a valid rownames(data) or row-index value")
        }
    }
    if (is.null(main)) 
        main = rownames(data)[data.row]
    if (scale == TRUE) {
        # "scale" below resolves to the base function, the argument is only a flag
        data = scale(data[, y.cols])
        data = apply(data, 2, function(x) x/max(abs(x)))
    }
    data = as.data.frame(data)
    n.y = length(y.cols)
    # Spoke angles in radians, starting at 12 o'clock; the last one closes the circle
    polar.vals = (90 + seq(0, 360, length.out = n.y + 1)) * pi/180

    if (add == FALSE) {
        # Empty canvas, title, one labelled spoke per column and the reference circles
        plot(0, xlim = c(-2.2, 2.2), ylim = c(-2.2, 2.2), type = "n", axes = F, 
            xlab = "", ylab = "")
        title(main)
        lapply(polar.vals, function(x) lines(c(0, 2 * cos(x)), c(0, 2 * sin(x))))
        lapply(1:n.y, function(x) text(2.15 * cos(polar.vals[x]), 2.15 * sin(polar.vals[x]), 
            y.cols[x], cex = 0.8))

        lapply(seq(0.5, 2, 0.5), function(x) lines(x * cos(seq(0, 2 * pi, length.out = 100)), 
            x * sin(seq(0, 2 * pi, length.out = 100)), lwd = 0.5, lty = 2, col = "gray60"))
        lines(cos(seq(0, 2 * pi, length.out = 100)), sin(seq(0, 2 * pi, length.out = 100)), 
            lwd = 1.2, col = "gray50")
    }

    # Radius 1 (the solid circle) corresponds to a value of 0
    r = 1 + data[data.row, y.cols]
    xs = r * cos(polar.vals)
    ys = r * sin(polar.vals)
    # Repeat the first point to close the polygon
    xs = c(xs, xs[1])
    ys = c(ys, ys[1])
    lines(xs, ys, col = col, lwd = 2, lty = lty)
    
    #### Clear plots to free up some memory
    if(!is.null(dev.list())) invisible(dev.off())
}
```

```{r plot_thumbnail, comment=NA, message=FALSE, warning=FALSE}
##### Register "allow_thumbnails" as the knitr "plot" hook, so it is applied to the plots of the chunks knitted after this point
##### NOTE(review): the hook is expected to generate a full-resolution pdf image before generating a small image in the chunk - confirm against the "allow_thumbnails" definition
knitr::knit_hooks$set(plot = allow_thumbnails)
```

```{r load_libraries, warning=FALSE}
##### Load libraries
suppressMessages(library(edgeR))
suppressMessages(library(limma))
suppressMessages(library(EDASeq))
suppressMessages(library(preprocessCore))
suppressMessages(library(rapportools))
suppressMessages(library(tximport))
suppressMessages(library(rhdf5))
suppressMessages(library(openxlsx))
suppressMessages(library(readr))
suppressMessages(library(tidyverse))
suppressMessages(library(dplyr))
suppressMessages(library(tidyr))
suppressMessages(library(rlang))
suppressMessages(library(DT))
suppressMessages(library(matrixStats))
suppressMessages(library(tibble))
suppressMessages(library(knitr))
suppressMessages(library(scales))
suppressMessages(library(RCircos))
suppressMessages(library(ggplot2))
suppressMessages(library(ggforce))
suppressMessages(library(pdftools))
suppressMessages(library(png))
suppressMessages(library(htmltools))
suppressMessages(library(htmlwidgets))
suppressMessages(library(devtools))
suppressMessages(library(lares))
suppressMessages(library(package=paste0("EnsDb.Hsapiens.v", params$ensembl_version), character.only = TRUE))
suppressMessages(library(package=paste0("BSgenome.Hsapiens.UCSC.hg", params$ucsc_genome_assembly), character.only = TRUE))
```

```{r prepare_parameters, message=FALSE, warning=FALSE}
##### Define Z-transformation direction: any value other than "gene-wise" (compared case-insensitively) falls back to "group-wise"
scaling <- if ( tolower(params$scaling) == "gene-wise" ) "gene-wise" else "group-wise"
```

```{r tx2ensembl, comment = NA, message=FALSE, warning=FALSE}
##### Build the transcript-to-gene lookup table from the Ensembl annotation package selected with "params$ensembl_version"
edb <- get(paste0("EnsDb.Hsapiens.v", params$ensembl_version))

##### Use every gene ID available in the annotation as a query key
keys <- keys(edb, keytype="GENEID")

##### Fetch the transcript IDs of each gene and rename the columns to "tx_name" and "gene_id"
tx2ensembl <- ensembldb::select(edb, keys=keys, columns=c("TXID", "GENEID"), keytype="GENEID")
names(tx2ensembl) <- gsub("GENEID", "gene_id", gsub("TXID", "tx_name", names(tx2ensembl)))

##### Clean the space
rm(edb, keys)
```

```{r load_ref_data, message=FALSE, warning=FALSE}
##### Load reference datasets
##### The reference cohort is taken from the user-defined "dataset" parameter (upper-cased)
dataset <- toupper(params$dataset)

##### For the external (TCGA) and internal (UMCCR) reference cohorts record: (1) counts file, (2) targets file and (3) the cohort label
##### The external counts file and label use only the part of the dataset name preceding the first "-"
ref_dataset <- list(
  "ext_ref" = c(
    paste0(params$ref_data_dir, "/ref_data/TCGA_", strsplit(dataset, split='-', fixed=TRUE)[[1]][1], "_Counts.exp.gz"),
    paste0(params$ref_data_dir, "/ref_data/TCGA_", dataset, "_Target.txt"),
    paste0(strsplit(dataset, split='-', fixed=TRUE)[[1]][1], " (TCGA)")
  ),
  "int_ref" = c(
    paste0(params$ref_data_dir, "/ref_data/UMCCR_PDAC_Counts.exp.gz"),
    paste0(params$ref_data_dir, "/ref_data/UMCCR_PDAC_Target.txt"),
    "PAAD (UMCCR)"
  )
)

##### Create a list with reference datasets (one empty slot per dataset)
ref_dataset.list <- setNames(vector("list", length(dataset)), dataset)

##### Create a list with various sets of genes
ref_genes <- c("genes_cancer", "genes_oncokb", "genes_immune", "genes_hrd")
ref_genes.list <- setNames(vector("list", length(ref_genes)), ref_genes)

##### Create a list with cancer genes annotations
caner_genes_annot <- c("oncokb_clin_vars", "oncokb_all_vars")
caner_genes_annot.list <- setNames(vector("list", length(caner_genes_annot)), caner_genes_annot)

##### Get the subject ID (empty string when it was not provided)
subjectID <- if ( !is.na(params$subject_id) ) params$subject_id else ""

##### Import the transcript-level quantification of the investigated sample and summarise it to gene-level counts using the "tx2ensembl" table
##### kallisto output ("bcbio_rnaseq" parameter) takes precedence over salmon output ("dragen_rnaseq" parameter) when both are provided
if ( !is.null(params$bcbio_rnaseq) ) {
  
  ##### Get patient data dir and sample file name
  dataDir <- params$bcbio_rnaseq
  
  ##### Look at countsFromAbundance parameter to change the method to generate the counts
  txi.kallisto <- tximport(paste0(dataDir, "/kallisto/abundance.tsv"), type = "kallisto", tx2gene = tx2ensembl)
  
  ##### Extract kallisto counts to prepare dataframe (gene IDs go to the "rowname" column, counts to the "count" column)
  counts <- as.data.frame(txi.kallisto$counts) %>%
    tibble::rownames_to_column() %>%
    dplyr::rename(count = V1)
  
} else if ( !is.null(params$dragen_rnaseq) ) {
  
  ##### Get patient data dir and sample file name
  dataDir <- paste(params$dragen_rnaseq, "dragen", sep = "/")
  
  ##### Look at countsFromAbundance parameter to change the method to generate the counts
  ##### The salmon quantification file is recognised by its ".sf" extension
  txi.salmon <- tximport(paste0(dataDir, "/", list.files(dataDir, pattern="\\.sf$")), type = "salmon", tx2gene = tx2ensembl)
  
  ##### Extract salmon counts to prepare dataframe (gene IDs go to the "rowname" column, counts to the "count" column)
  counts <- as.data.frame(txi.salmon$counts) %>%
    tibble::rownames_to_column() %>%
    dplyr::rename(count = V1)
  
} else {
  
  ##### Without any of the two inputs "dataDir" and "counts" would never be defined and the report would fail further down with an unrelated "object not found" error
  stop("No expression data provided: either the \"bcbio_rnaseq\" or the \"dragen_rnaseq\" parameter must be set", call. = FALSE)
}

##### Make sure that the directory for results exists
results_dir <- paste0(params$report_dir, "/", params$sample_name, params$dataset_name_incl, ".results")

if ( !file.exists(results_dir) ) dir.create(results_dir, recursive=TRUE)

##### Load the spreadsheet with clinical information (first sheet) if the file exists. "runClinicalChunk" records whether it was found
clinical_info_file <- params$clinical_info
runClinicalChunk <- file.exists(clinical_info_file)

if ( runClinicalChunk ) {
  ref_dataset.list[[dataset]][["clinical_info"]] <- read.xlsx(
    xlsxFile = clinical_info_file,
    sheet = 1,
    colNames = TRUE,
    rowNames = FALSE,
    detectDates = TRUE,
    skipEmptyRows = TRUE,
    skipEmptyCols = TRUE,
    check.names = TRUE
  )
}

##### Read in the tab-delimited gene lists located in the reference data directory
ref_genes.list[["genes_cancer"]] <- read.table(
  paste(params$ref_data_dir, params$genes_cancer, sep="/"),
  sep="\t", as.is=TRUE, header=TRUE, row.names=NULL, quote=""
)

##### For the OncoKB list the "#" character is not treated as a comment indicator
ref_genes.list[["genes_oncokb"]] <- read.table(
  paste(params$ref_data_dir, params$oncokb_genes, sep="/"),
  sep="\t", as.is=TRUE, header=TRUE, row.names=NULL, quote="", comment.char = ""
)

##### Immune genes are kept as a nested list ("immune_markers" and, optionally, "immunogram")
ref_genes.list[["genes_immune"]]$immune_markers <- read.table(
  paste(params$ref_data_dir, params$genes_immune_markers, sep="/"),
  sep="\t", as.is=TRUE, header=TRUE, row.names=NULL, quote=""
)

ref_genes.list[["genes_hrd"]] <- read.table(
  paste(params$ref_data_dir, params$genes_hrd, sep="/"),
  sep="\t", as.is=TRUE, header=TRUE, row.names=NULL, quote=""
)

##### The immunogram genes are read only when the immunogram was requested
if ( params$immunogram ) {
  ref_genes.list[["genes_immune"]]$immunogram <- read.table(
    paste(params$ref_data_dir, params$genes_immunogram, sep="/"),
    sep="\t", as.is=TRUE, header=TRUE, row.names=NULL, quote=""
  )
}

##### Read in gene fusion data for the investigated sample
##### Arriba fusion calls: define the expected locations of the Arriba results table ("fusions.tsv") and of the booklet with fusion plots ("fusions.pdf") within the sample data directory
##### "runArribaChunk" and "runFusionChunk" stay FALSE unless at least one fusion is reported by Arriba. In every case a fusion list file (possibly holding just an empty entry) is written to the results directory
arriba_file <- paste(dataDir, "arriba", "fusions.tsv", sep = "/")
arriba_pdf <- paste(dataDir, "arriba", "fusions.pdf", sep = "/")
runArribaChunk <- FALSE
runFusionChunk <- FALSE

if ( file.exists(arriba_file) ) {
  ##### "#" is not treated as a comment indicator here. NOTE(review): the "X.gene1" column used below presumably comes from a "#gene1" header made syntactically valid by read.table - confirm against an Arriba output file
  ref_genes.list[["arriba"]] <- read.table(file = arriba_file, header = TRUE, comment.char = "", quote = "")
  
  ##### Make sure that at least one fusions has been reported by Arriba
  if ( nrow(ref_genes.list[["arriba"]]) > 0 ) {
    
    ##### Convert Arriba pdf booklet with fusion plots to png images (done by the "arriba_plots" function, only if the booklet exists)
    if ( file.exists(arriba_pdf) ) {
      arriba_plots(arriba_file = arriba_file, arriba_results = ref_genes.list[["arriba"]], results_dir = paste0(results_dir, "/arriba"))
    }
    
    ##### Write list of fusion events for which Arriba plot is available into a file (for PIEdb portal)
    ##### Each entry has the form "<gene1>__<gene2>_<breakpoint1>-<breakpoint2>" with the gene pair passed through make.names and every ":" replaced by "."; the list starts with an empty entry
    fusion <- gsub(":", ".", c("", paste0(make.names(paste(ref_genes.list[["arriba"]]$X.gene1, ref_genes.list[["arriba"]]$gene2, sep = "__")), "_", ref_genes.list[["arriba"]]$breakpoint1, "-", ref_genes.list[["arriba"]]$breakpoint2)))
    
    write.table(prepare2write(fusion), file = paste0(results_dir, "/", params$sample_name, params$dataset_name_incl, ".RNAseq_report.arriba_fusions.txt"), sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE, append = FALSE )
  
    runArribaChunk <- TRUE
    runFusionChunk <- TRUE
    
  } else {
    ##### The Arriba table exists but is empty: write a fusion list file holding just an empty entry (for PIEdb portal)
  write.table(prepare2write(""), file = paste0(results_dir, "/", params$sample_name, params$dataset_name_incl, ".RNAseq_report.arriba_fusions.txt"), sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE, append = FALSE )
  }
  
} else {
  ##### No Arriba table found: write a fusion list file holding just an empty entry (for PIEdb portal)
  write.table(prepare2write(""), file = paste0(results_dir, "/", params$sample_name, params$dataset_name_incl, ".RNAseq_report.arriba_fusions.txt"), sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE, append = FALSE )
}

##### Read in dragen fusion calls
##### Look for Dragen's fusion output (file name ending with ".fusion_candidates.final") in the sample data directory
##### With "full.names = TRUE" nothing is returned if no file matches. Pasting the directory onto an empty match would instead give "<dataDir>/", which exists and would then be read as if it was the fusion file
dragen_fusion_file <- list.files(dataDir, pattern="\\.fusion_candidates.final$", full.names = TRUE)
runDragenFusionChunk <- FALSE

if ( !is.null(params$dragen_rnaseq) && length(dragen_fusion_file) > 0 ) {
  
  ##### Dragen's fusion output file header starts with '#' hence change the comment indicator option to '^' ( https://stackoverflow.com/questions/27196470/reading-a-line-that-starts-with-a-hash-on-a-txt-file )
  ##### If several files match, only the first one is used
  dragen_fusion <- read.table(file = dragen_fusion_file[1], header = TRUE, comment.char = '^', quote = "")
  
  ##### The "#FusionGene" header is read in as "X.FusionGene". Rename that column by name, so that the result depends neither on the column order nor on the number of columns reported by a given Dragen version
  colnames(dragen_fusion)[ colnames(dragen_fusion) == "X.FusionGene" ] <- "FusionGene"
  
  ##### Split the "<gene1>--<gene2>" fusion name into two separate columns
  dragen_fusion_genes <- dragen_fusion %>%
    tidyr::separate(col = FusionGene, into = c("gene1", "gene2"), sep = "--")
  
  ref_genes.list[["dragenFusion"]] <- dragen_fusion_genes
  
  runDragenFusionChunk <- TRUE
  runFusionChunk <- TRUE
}


##### Read in pizzly fusion calls
##### Expected locations of the unfiltered and filtered pizzly tables within the sample data directory
pizzly_file <- paste(dataDir, "pizzly", paste0(params$sample_name, "-flat.tsv"), sep = "/")
pizzly_file_filtered <- paste(dataDir, "pizzly", paste0(params$sample_name, "-flat-filtered.tsv"), sep = "/")
runPizzlyChunk <- FALSE

##### The unfiltered table is used only for bcbio-rnaseq input; otherwise fall back to the filtered table, if present
pizzly_use_unfiltered <- !is.null(params$bcbio_rnaseq) && file.exists(pizzly_file)

if ( pizzly_use_unfiltered || file.exists(pizzly_file_filtered) ) {
  ref_genes.list[["pizzly"]] <- read.table(file = if ( pizzly_use_unfiltered ) pizzly_file else pizzly_file_filtered, header = TRUE, quote = "")
  runPizzlyChunk <- TRUE
  runFusionChunk <- TRUE
}

##### Clean the space
rm(pizzly_use_unfiltered)

##### Read in genomic data (small variants, copy-number and structural variants) for the investigated sample
##### Get the genomic output data from umccrise. The "runPcgrChunk", "runPurpleChunk" and "runSVsChunk" flags record which inputs were found (all FALSE if no umccrise directory was provided)
if ( !is.null(params$umccrise) ) {
  
  ##### The name of the umccrise output folder (last element of the path) is used as a prefix of the input file names
  umccrise <- unlist(strsplit(params$umccrise, split='/', fixed=TRUE))
  umccrise <- umccrise[length(umccrise)]
  
  ##### Check if PCGR (mutation) output file exists. Several locations/file names are tried and the first existing one is used
  runPcgrChunk <- TRUE
  
  if ( file.exists(paste(params$umccrise, "small_variants", paste0(umccrise, "-somatic.pcgr.snvs_indels.tiers.tsv"), sep = "/")) ) {
    pcgr_file <- paste(params$umccrise, "small_variants", paste0(umccrise, "-somatic.pcgr.snvs_indels.tiers.tsv"), sep = "/")
  } else if ( file.exists(paste(params$umccrise, "..", "work", umccrise, "pcgr", paste0(umccrise, "-somatic.pcgr.snvs_indels.tiers.tsv"), sep = "/")) ) {
    pcgr_file <- paste(params$umccrise, "..", "work", umccrise, "pcgr", paste0(umccrise, "-somatic.pcgr.snvs_indels.tiers.tsv"), sep = "/")
  } else if ( file.exists(paste(params$umccrise, "pcgr", paste0(umccrise, "-somatic.pcgr.snvs_indels.tiers.tsv"), sep = "/")) ) {
    pcgr_file <- paste(params$umccrise, "pcgr", paste0(umccrise, "-somatic.pcgr.snvs_indels.tiers.tsv"), sep = "/")
  } else if ( file.exists(paste(params$umccrise, "pcgr", paste0(umccrise, "-somatic.pcgr_acmg.grch37.snvs_indels.tiers.tsv"), sep = "/")) ) {
    pcgr_file <- paste(params$umccrise, "pcgr", paste0(umccrise, "-somatic.pcgr_acmg.grch37.snvs_indels.tiers.tsv"), sep = "/")
  } else {
    runPcgrChunk <- FALSE
  }
  
  if ( runPcgrChunk ) {
    ref_genes.list[["pcgr"]] <- read.table(pcgr_file, sep="\t", as.is=TRUE, header=TRUE, row.names=NULL, fill = TRUE, quote = "")
    
    ##### Simplify the variants types (drop the "_variant" suffix and use spaces instead of underscores)
    ref_genes.list[["pcgr"]]$CONSEQUENCE <- gsub("_variant", "", ref_genes.list[["pcgr"]]$CONSEQUENCE)
    ref_genes.list[["pcgr"]]$CONSEQUENCE <- gsub("_", " ", ref_genes.list[["pcgr"]]$CONSEQUENCE)
    
    ##### Simplify tiers' annotations (drop the "TIER " prefix) and round AFs to two decimal places
    ref_genes.list[["pcgr"]]$TIER <- gsub("TIER ", "", ref_genes.list[["pcgr"]]$TIER)
    ref_genes.list[["pcgr"]]$AF_TUMOR <- round(ref_genes.list[["pcgr"]]$AF_TUMOR, digits = 2)
  } else {
    ref_genes.list[["pcgr"]] <- NULL
  }
  
  ##### Check if purple (CN) output file exists. Two file names are supported
  purple_file_1 <- paste(params$umccrise, "purple", paste0(umccrise, ".purple.gene.cnv"), sep = "/")
  purple_file_2 <- paste(params$umccrise, "purple", paste0(umccrise, ".purple.cnv.gene.tsv"), sep = "/")
  runPurpleChunk <- TRUE
  
  if ( file.exists(purple_file_1) ) {
    ref_genes.list[["purple"]] <- read.table(purple_file_1, sep="\t", as.is=TRUE, header=TRUE, row.names=NULL, fill = TRUE, quote = "")
  } else if ( file.exists(purple_file_2) ) {
    ref_genes.list[["purple"]] <- read.table(purple_file_2, sep="\t", as.is=TRUE, header=TRUE, row.names=NULL, fill = TRUE, quote = "")
    ##### Column names of this file version get their first letter capitalised
    colnames(ref_genes.list[["purple"]]) <- sapply(colnames(ref_genes.list[["purple"]]), CapStr)
  } else {
    ref_genes.list[["purple"]] <- NULL
    runPurpleChunk <- FALSE
  }
  
  ##### Check if manta (structural variants (SVs)) file exists
  sv_file_1 <- paste(params$umccrise, "structural", paste0(umccrise, "-sv-prioritize-manta-pass.tsv"), sep = "/")
  sv_file_2 <- paste(params$umccrise, "structural", paste0(umccrise, "-manta.tsv"), sep = "/")
  runSVsChunk <- TRUE
  
  if ( file.exists(sv_file_1) ) {
    ref_genes.list[["manta"]] <- sv_prioritize_short(sv_file_1)
  } else if ( file.exists(sv_file_2) ) {
    manta_cols <- c("Tier", "Event", "Gene", "Effect", "Detail", "Location", "AF", "CN chg", "SR", "PR", "CN", "Ploidy", "Transcript", "Other effects")
    manta_svs <- sv_prioritize(sv_file_2)
    
    ##### Check if there are any SVs BEFORE selecting the columns: subsetting a missing or empty result by column name fails, and then the empty table below would never be created
    if ( is.data.frame(manta_svs) && nrow(manta_svs) > 0 ) {
      
      ##### Keep the columns to report and omit SVs without assigned gene
      manta_svs <- manta_svs[, manta_cols]
      manta_svs <- manta_svs[ manta_svs$Gene != "",  ]
    } else {
      ##### Create empty dataframe
      manta_svs <- data.frame(matrix(ncol = length(manta_cols), nrow = 0))
      colnames(manta_svs) <- manta_cols
    }
    
    ref_genes.list[["manta"]] <- manta_svs
    
    ##### Clean the space
    rm(manta_cols, manta_svs)
    
  } else {
    ref_genes.list[["manta"]] <- NULL
    runSVsChunk <- FALSE
  }
  
  ##### Extract subject ID (part of the umccrise output folder name preceding the first "__"). This overwrites the "subjectID" value set earlier
  subjectID <- unlist(strsplit(umccrise, split='__', fixed=TRUE))[1]
  
} else {
  runPcgrChunk <- FALSE
  runPurpleChunk <- FALSE
  runSVsChunk <- FALSE
}

##### Read in OncoKB (http://oncokb.org) annotations
caner_genes_annot.list[["oncokb_clin_vars"]] <- read.table(
  paste(params$ref_data_dir, params$oncokb_clin_vars, sep="/"),
  sep="\t", as.is=TRUE, header=TRUE, row.names=NULL, quote=""
)
caner_genes_annot.list[["oncokb_all_vars"]] <- read.table(
  paste(params$ref_data_dir, params$oncokb_all_vars, sep="/"),
  sep="\t", as.is=TRUE, header=TRUE, row.names=NULL, quote="", fill = TRUE
)

##### Read in CIViC (https://civicdb.org/) annotations
caner_genes_annot.list[["civic_var_summaries"]] <- read.table(
  paste(params$ref_data_dir, params$civic_var_summaries, sep="/"),
  sep="\t", as.is=TRUE, header=TRUE, row.names=NULL, quote="", fill = TRUE
)
caner_genes_annot.list[["civic_clin_evid"]] <- read.table(
  paste(params$ref_data_dir, params$civic_clin_evid, sep="/"),
  sep="\t", as.is=TRUE, header=TRUE, row.names=NULL, quote="", fill = TRUE
)

##### Read in Cancer Biomarkers database (https://www.cancergenomeinterpreter.org/biomarkers) annotations. This is mainly used to annotate reported fusion events
caner_genes_annot.list[["cancer_biomarkers_trans"]] <- read.table(
  paste(params$ref_data_dir, params$cancer_biomarkers_trans, sep="/"),
  sep="\t", as.is=TRUE, header=TRUE, row.names=NULL, quote="", fill = TRUE
)

##### Read in FusionGDB database (https://ccsm.uth.edu/FusionGDB/) used to annotate reported fusion events, with info about head and tail genes
##### This file is read without a header line, so the column names are assigned here
caner_genes_annot.list[["FusionGDB"]] <- read.table(
  paste(params$ref_data_dir, params$FusionGDB, sep="/"),
  sep="\t", as.is=TRUE, header=FALSE, row.names=NULL, quote="", fill = TRUE
)
names(caner_genes_annot.list[["FusionGDB"]]) <- c("Hgene", "HgeneID", "Tgene", "TgeneID", "FGname", "FGID")


##### Add reference cohort name to the sample name (only if the dataset name is to be included in the output names)
if ( params$dataset_name_incl != "" ) {
  sample_name <- paste0(params$sample_name, "_", params$dataset)
} else {
  sample_name <- params$sample_name
}

##### Put aside the clinical information (if any was loaded), since the assignment below replaces the whole list element for this dataset and would otherwise discard it
clinical_info_kept <- ref_dataset.list[[dataset]][["clinical_info"]]

##### Read in reference datasets and merge them with sample data. This part outputs a vector with first element containing the merged data and second element containing merged targets info
ref_dataset.list[[dataset]] <- combineDatasets(sample_name=sample_name, sample_counts=counts, ref_data=ref_dataset, report_dir = results_dir, dataset = dataset)
names(ref_dataset.list[[dataset]]) <- c("combined_data", "sample_annot")

##### Restore the clinical information as an additional element, placed after the merged data and targets info
if ( !is.null(clinical_info_kept) ) {
  ref_dataset.list[[dataset]][["clinical_info"]] <- clinical_info_kept
}

##### Define internal, external and additional cancer group names based on the targets definition
int_cancer_group <- ref_dataset$int_ref[3]
ext_cancer_group <- ref_dataset$ext_ref[3]

##### More than three distinct targets mean that an additional cancer group is present; it is taken from the second distinct target value
if ( length(unique(ref_dataset.list[[dataset]][["sample_annot"]]$Target)) > 3 ) {
  
  add_cancer_group <- unique(ref_dataset.list[[dataset]][["sample_annot"]]$Target)[2]
} else {
  add_cancer_group <- NULL
}

##### Define the cancer group to be used to compare per-gene expression values and report in the summary tables
##### The internal cohort is used for the PAAD datasets and the external cohort for all the other datasets
if ( dataset == "PAAD" || dataset == "PAAD-IPMN" || dataset == "PAAD-NET" || dataset == "PAAD-ACC" ) {
  comp_cancer_group <- int_cancer_group
} else {
  comp_cancer_group <- ext_cancer_group
}

##### Clean the space
rm(counts, tx2ensembl, clinical_info_kept)
```

```{r mysql_populate, message=FALSE, warning=FALSE}
##### Assemble the beginning of the MySQL command used to populate the RNA-seq data portal: the INSERT statement with the values for ID, Platform, PatientID, SampleID, Cancer, Source, Project, Report and PMID
##### The text ends right after the opening quote of the next value. NOTE(review): presumably completed by chunks further down - confirm
mysql_populate <- paste0(
  "### MySQL command to insert data for sample \"", sample_name, "\"\nuse piedb;\nINSERT INTO RNAseq_reports ( ID ,Platform, PatientID, SampleID, Cancer, Source, Project, Report, PMID, Analysis, Summary, Date ) VALUES ( 1000000, \"RNA_seq\"",
  ", \"", subjectID,
  "\", \"", sample_name,
  "\", \"", params$dataset,
  "\", \"", params$sample_source,
  "\", \"", params$project,
  "\", \"", paste0(sample_name, ".RNAseq_report.html"),
  "\", \"", sample_name,
  "\", \""
)

##### Matching "ON DUPLICATE KEY UPDATE" part of the command, carrying the same values and ending at the same position
mysql_populate_update <- paste0(
  "ON DUPLICATE KEY UPDATE ID=1000000 ,Platform=\"RNA_seq\"",
  ", PatientID=\"", subjectID,
  "\", SampleID=\"", sample_name,
  "\", Cancer=\"", params$dataset,
  "\", Source=\"", params$sample_source,
  "\", Project=\"", params$project,
  "\", Report=\"", paste0(sample_name, ".RNAseq_report.html"),
  "\", PMID=\"", sample_name,
  "\", Analysis=\""
)
```

```{r treatment_info, comment = NA, message=FALSE, warning=FALSE, fig.width = 12, fig.height = 5, eval = runClinicalChunk }
##### Prepare data for the treatment timeline plot
##### Pick the identifier used to find the investigated patient in the clinical information: the first usable value out of "clinical_id", "subject_id" and "subjectID"
##### Missing values AND empty strings are skipped. An empty pattern matches every column and row, which would silently select the data of other patients
treamtent.patient_id <- c(params$clinical_id, params$subject_id, subjectID)
treamtent.patient_id <- treamtent.patient_id[ !is.na(treamtent.patient_id) & nzchar(treamtent.patient_id) ][1]

##### Search for column(s) with the patient identifier. No match (or no usable identifier) leaves "sampleID.col" empty and the rest of the chunk is skipped
sampleID.col <- integer(0)

if ( !is.na(treamtent.patient_id) ) {
  sampleID.col <- grep(treamtent.patient_id, ref_dataset.list[[dataset]][["clinical_info"]])
}

##### The flag is set back to TRUE only if the timeline plot is generated
runClinicalChunk <- FALSE

if ( length(sampleID.col) > 0 ) {
  
  ##### Identify row with patient's details within the matching column
  sampleID.row <- grep(treamtent.patient_id, ref_dataset.list[[dataset]][["clinical_info"]][, sampleID.col])
  
  clinical_info <- ref_dataset.list[[dataset]][["clinical_info"]][ sampleID.row, ]
  
  ##### Prepare data frame structure for plotting
  ##### Define treatment types (spreadsheet column names, made syntactically valid in the same way as when the spreadsheet was read in) and their short labels
  treamtent.types <- make.names(c("NEOADJUVANT REGIMEN", "ADJUVANT REGIMEN", "FIRST LINE REGIMEN", "SECOND LINE REGIMEN", "THIRD LINE REGIMEN"))
  treamtent.types_simple <- c("Neoadjuvant", "Adjuvant", "1st line", "2nd line", "3rd line")
  treamtent.df <- data.frame(matrix(ncol = 4, nrow = 0))
  colnames(treamtent.df) <- c("Treatment", "Type", "Start", "End")

  for ( i in seq_along(treamtent.types) ) {
    
    ##### Identify treatment column number
    treamtent.types.col <- grep(paste0("^",treamtent.types[i], "$"), names(clinical_info))
    
    ##### Check how many treatments of particular type were used (comma-separated within one cell)
    treamtent.types.details <- unlist(strsplit(clinical_info[, treamtent.types.col], split=',', fixed=TRUE))
    
    ##### Add start and end info for each treatment
    if ( any(!is.na(treamtent.types.details ), na.rm = FALSE) ) {
      for ( treatment in treamtent.types.details ) {
        
        ##### The start and end dates are read from the two columns following the treatment column
        treamtent.start <- clinical_info[, treamtent.types.col+1]
        treamtent.end <- clinical_info[, treamtent.types.col+2]

        ##### Use current data if treatment is still ongoing
        today <- as.character(Sys.Date())
        treamtent.end[ is.na(treamtent.end) ] <- today
        treamtent.tmp <- data.frame( treatment, treatment, treamtent.types_simple[i], treamtent.start, treamtent.end)
        treamtent.df <- rbind( treamtent.df, treamtent.tmp)
      }
    }
  }
  
  if ( nrow(treamtent.df) > 0 ) {
    ##### For security reasons (wrt plots that go to PIEdb), change the dates but preserve the duration of the treatments
    ##### Get the earliest treatment date and set it as day 0. Then, create fake start and end dates based on the treatment length
    day0 <- sort(treamtent.df$treamtent.start, decreasing = FALSE)[1]
    treamtents.length <- treamtent.df$treamtent.end - treamtent.df$treamtent.start
    treamtents.reset <- as.Date("2000-01-01") - day0
    treamtent.df$treamtent.start <- treamtent.df$treamtent.start + treamtents.reset
    treamtent.df$treamtent.end <- treamtent.df$treamtent.start + treamtents.length
    names(treamtent.df) <- c("Treatment", "Drug", "Type", "Start",  "End")
    
    ##### Create directory for timeline plot
    PlotsDir <- paste(results_dir, "clinical_info", sep = "/")
    if ( !file.exists(PlotsDir) ) {
      dir.create(PlotsDir, recursive=TRUE)
    }
        
    ##### Record the timeline plot. NOTE, the modified dates are used here
    treatment_timeline <- lares::plot_timeline(event = treamtent.df$Treatment, start = treamtent.df$Start, end = treamtent.df$End, label = NA, group = treamtent.df$Type, title = "", subtitle = "", save = FALSE)
    
    ##### Save the plot into png file. NOTE, the modified dates are used here. As default, the plot is saved as "cv_timeline"
    lares::plot_timeline(event = treamtent.df$Treatment, start = treamtent.df$Start, end = treamtent.df$End, label = NA, group = treamtent.df$Type, title = "", subtitle = "", save = TRUE, subdir = "clinical_info")
    
    #### Clear plots to free up some memory
    if(!is.null(dev.list())) invisible(dev.off())
    
    cv_timeline.png <- readPNG("clinical_info/cv_timeline.png", native = FALSE, info = FALSE)
    
    ##### Change the size of the timeline png plot and save it as "treatment_timeline.png"
    png::writePNG(cv_timeline.png, paste(PlotsDir, "treatment_timeline.png", sep="/"), dpi=300)
    #png(paste(PlotsDir, "treatment_timeline.png", sep="/"), width = 900, height = 600, pointsize = 0.0001, res=300)
    #plot(cv_timeline.png)
    #invisible(dev.off())
    
    #### Clear plots to free up some memory
    if(!is.null(dev.list())) invisible(dev.off())
    
    ##### Remove the original plot folder (created in the working directory) without relying on an external shell command
    unlink("clinical_info", recursive = TRUE)
    
    runClinicalChunk <- TRUE
  }

  ##### Clean the space ("cv_timeline.png" exists only if the plot was generated)
  rm(list = intersect(c("clinical_info", "cv_timeline.png"), ls()))
}

##### Clean the space (also when no patient was found)
rm(list = ls(pattern='^treamtent.*'))
```

```{r cancer_genes_prep, comment = NA, message=FALSE, warning=FALSE}
##### Combine UMCCR cancer gene list (https://github.com/vladsaveliev/NGS_Utils/blob/master/ngs_utils/reference_data/key_genes/umccr_cancer_genes.2019-03-20.tsv) with OncoKB cancer genes
##### Start from the OncoKB table and append the UMCCR-specific columns with their default values ("No" for the UMCCR membership and "-" for the gene role flags)
genes_cancer <- ref_genes.list[["genes_oncokb"]]
genes_cancer$UMCCR <- rep("No", nrow(genes_cancer))
genes_cancer$Oncogene <- rep("-", nrow(genes_cancer))
genes_cancer$TSG <- rep("-", nrow(genes_cancer))
genes_cancer$Fusion <- rep("-", nrow(genes_cancer))
genes_cancer$Germline <- rep("-", nrow(genes_cancer))

##### Flag Oncogenes, TSGs and fusion genes in the UMCCR cancer genes list (https://github.com/vladsaveliev/NGS_Utils/blob/master/ngs_utils/reference_data/key_genes/umccr_cancer_genes.2019-03-20.tsv)
##### The TRUE/FALSE flags are converted to "Yes"/"-" text
ref_genes.list[["genes_cancer"]]$germ <- gsub("TRUE", "Yes", ref_genes.list[["genes_cancer"]]$germ)
ref_genes.list[["genes_cancer"]]$germ <- gsub("FALSE", "-", ref_genes.list[["genes_cancer"]]$germ)
ref_genes.list[["genes_cancer"]]$fusion <- gsub("TRUE", "Yes", ref_genes.list[["genes_cancer"]]$fusion)
ref_genes.list[["genes_cancer"]]$fusion <- gsub("FALSE", "-", ref_genes.list[["genes_cancer"]]$fusion)
ref_genes.list[["genes_cancer"]]$tumorsuppressor <- gsub("TRUE", "Yes", ref_genes.list[["genes_cancer"]]$tumorsuppressor)
ref_genes.list[["genes_cancer"]]$tumorsuppressor <- gsub("FALSE", "-", ref_genes.list[["genes_cancer"]]$tumorsuppressor)
ref_genes.list[["genes_cancer"]]$oncogene <- gsub("TRUE", "Yes", ref_genes.list[["genes_cancer"]]$oncogene)
ref_genes.list[["genes_cancer"]]$oncogene <- gsub("FALSE", "-", ref_genes.list[["genes_cancer"]]$oncogene)

##### Go through the UMCCR genes one by one and either update the matching OncoKB row or append a new row
for ( gene in unlist(ref_genes.list[["genes_cancer"]]$symbol ) ) {
  ##### Check if the UMCCR genes is already reported in OncoKB
  if ( gene %in% genes_cancer$Hugo.Symbol ) {
   
    ##### Mark the gene as present in the UMCCR list and copy its role flags
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, ]$UMCCR <- "Yes"
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, ]$Oncogene <- ref_genes.list[["genes_cancer"]]$oncogene[ref_genes.list[[ "genes_cancer"]]$symbol==gene]
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, ]$TSG <- ref_genes.list[["genes_cancer"]]$tumorsuppressor[ref_genes.list[[ "genes_cancer"]]$symbol==gene]
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, ]$Fusion <- ref_genes.list[["genes_cancer"]]$fusion[ref_genes.list[[ "genes_cancer"]]$symbol==gene]
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, ]$Germline <- ref_genes.list[["genes_cancer"]]$germ[ref_genes.list[[ "genes_cancer"]]$symbol==gene]
    
    ##### Increase the value in the second column (named "Gene panels no." further down) by one
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, 2] <- as.numeric(genes_cancer[ genes_cancer$Hugo.Symbol==gene, 2]) + 1
    
  ##### Add if not present
  } else {
    ##### NOTE(review): the appended vector holds 12 values while the table is given 16 column names further down - presumably the remaining columns are filled by recycling and then overwritten by the four assignments below; confirm
    genes_cancer <- rbind(genes_cancer, c(gene, 1, "No", rep("", 8), "Yes"))
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, ]$Oncogene <- ref_genes.list[["genes_cancer"]]$oncogene[ref_genes.list[[ "genes_cancer"]]$symbol==gene]
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, ]$TSG <- ref_genes.list[["genes_cancer"]]$tumorsuppressor[ref_genes.list[[ "genes_cancer"]]$symbol==gene]
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, ]$Fusion <- ref_genes.list[["genes_cancer"]]$fusion[ref_genes.list[[ "genes_cancer"]]$symbol==gene]
    genes_cancer[ genes_cancer$Hugo.Symbol==gene, ]$Germline <- ref_genes.list[["genes_cancer"]]$germ[ref_genes.list[[ "genes_cancer"]]$symbol==gene]
  }
}

##### Make the data frame to look nicer: use gene symbols as row names, assign readable column names, reorder the columns (the "Gene" and "... (OncoKB)" role columns are dropped) and show "No" and empty values as "-"
rownames(genes_cancer) <- genes_cancer$Hugo.Symbol
names(genes_cancer) <- c("Gene", "Gene panels no.", "OncoKB", "Oncogene (OncoKB)", "TSG (OncoKB)", "MSK-IMPACT", "MSK-HEME", "Foundation One", "Foundation One Heme", "Vogelstein", "Sanger CGC", "UMCCR", "Oncogene", "TSG", "Fusion", "Germline")
genes_cancer <- genes_cancer[,c("Oncogene", "TSG", "Fusion", "Germline", "Gene panels no.", "UMCCR", "OncoKB", "MSK-IMPACT", "MSK-HEME", "Foundation One", "Foundation One Heme", "Vogelstein", "Sanger CGC")]
genes_cancer[ genes_cancer=="No" ] <- "-"
genes_cancer[ genes_cancer=="" ] <- "-"

##### Store the combined table as "genes_cancer". "genes_oncokb" is replaced with the subset of the combined table limited to the genes from the original OncoKB list
ref_genes.list[["genes_cancer"]] <- genes_cancer
ref_genes.list[["genes_oncokb"]] <- genes_cancer[ rownames(genes_cancer) %in% ref_genes.list[["genes_oncokb"]]$Hugo.Symbol, ]

##### Clean the space
rm(genes_cancer)
```

```{r goi_summary, comment = NA, message=FALSE, warning=FALSE}
##### Collect all genes of interest to make sure that they are not filtered out during read counts data processing
# PCGR annotation of mutated genes in given patient: gene symbols of variants classified within the user-defined tier range
if ( runPcgrChunk ) {
  
  ##### Symbols of variants assigned to tiers 1 up to params$pcgr_tier
  ref_genes.list[["summary"]]$Mutated <- unique(
    ref_genes.list[["pcgr"]]$SYMBOL[ ref_genes.list[["pcgr"]]$TIER %in% c(1:params$pcgr_tier) ]
  )
  
  ##### Optionally add genes with splice region variants, i.e. records whose "TIER.CONSEQUENCE" label matches the pattern below
  if ( params$pcgr_splice_vars ) {
    ref_genes.list[["summary"]]$Mutated <- unique(c(
      ref_genes.list[["summary"]]$Mutated,
      ref_genes.list[["pcgr"]]$SYMBOL[ grepl("NONCODING.*splice region", paste0(ref_genes.list[["pcgr"]]$TIER, ".", ref_genes.list[["pcgr"]]$CONSEQUENCE)) ]
    ))
  }
  
  ##### Drop the entry when no genes were found, otherwise remove NAs
  if ( length(ref_genes.list[["summary"]]$Mutated) == 0 ) {
    ref_genes.list[["summary"]]$Mutated <- NULL
  } else {
    ref_genes.list[["summary"]]$Mutated <- ref_genes.list[["summary"]]$Mutated[ !is.na(ref_genes.list[["summary"]]$Mutated) ]
  }
}
    
# ARRIBA, PIZZLY and DRAGEN annotation of gene fusion events detected in given patient. The gene symbols of both fusion partners are collected from every enabled fusion caller
if ( runFusionChunk ) {
  
  ##### Start with the Arriba calls, or with an empty entry if Arriba results are not used
  if ( runArribaChunk ) {
    ref_genes.list[["summary"]]$Fusion <- unique(c(as.character(ref_genes.list[["arriba"]]$X.gene1), as.character(ref_genes.list[["arriba"]]$gene2)))
  } else {
    ref_genes.list[["summary"]]$Fusion <- NULL
  }
  
  ##### Add the Pizzly calls
  if ( runPizzlyChunk ) {
    ref_genes.list[["summary"]]$Fusion <- unique(c(ref_genes.list[["summary"]]$Fusion, as.character(ref_genes.list[["pizzly"]]$geneA.name), as.character(ref_genes.list[["pizzly"]]$geneB.name)))
  }
  
  ##### Add the Dragen calls
  if ( runDragenFusionChunk ) {
    ref_genes.list[["summary"]]$Fusion <- unique(c(ref_genes.list[["summary"]]$Fusion, as.character(ref_genes.list[["dragenFusion"]]$gene1), as.character(ref_genes.list[["dragenFusion"]]$gene2)))
  }
  
  ##### Remove NAs. The fusion entry itself is tested here, so that the fusion genes are kept regardless of whether any mutated genes were recorded
  if ( length(ref_genes.list[["summary"]]$Fusion) > 0 ) {
    ref_genes.list[["summary"]]$Fusion <- ref_genes.list[["summary"]]$Fusion[ !(is.na(ref_genes.list[["summary"]]$Fusion)) ]
  } else {
    ref_genes.list[["summary"]]$Fusion <- NULL
  }
}

# MANTA annotation of structural variants (SVs): genes affected by SVs in given patient
if ( runSVsChunk ) {
  
  ##### Take the gene annotation of each SV record, skipping the records with no gene assigned
  # NOTE: genes classified by MANTA as fusion genes are not distinguished here
  ref_genes.list[["summary"]]$SV <- ref_genes.list[["manta"]]$Gene[ ref_genes.list[["manta"]]$Gene != "" ]
  
  ##### Drop the entry when no genes were found. Otherwise split the "&"-separated gene lists into individual symbols and remove NAs
  if ( length(ref_genes.list[["summary"]]$SV) == 0 ) {
    ref_genes.list[["summary"]]$SV <- NULL
  } else {
    ref_genes.list[["summary"]]$SV <- unique(unlist(strsplit(ref_genes.list[["summary"]]$SV, split = "&", fixed = TRUE)))
    ref_genes.list[["summary"]]$SV <- ref_genes.list[["summary"]]$SV[ !is.na(ref_genes.list[["summary"]]$SV) ]
  }
}

# PURPLE annotation of copy-number (CN) altered genes in given patient based on PURPLE results, including only those with CN values meeting user-defined thresholds
if ( runPurpleChunk ) {
  ref_genes.list[["summary"]]$CN <- ref_genes.list[["purple"]]
  
  ##### Skip records with no gene assigned. The test is made on the "Gene" column (testing the whole data frame yields one value per column, which is not a valid row filter)
  ref_genes.list[["summary"]]$CN <- ref_genes.list[["summary"]]$CN[ ref_genes.list[["summary"]]$CN$Gene %!in% "",  ]
  
  ##### Get the CN mean (average of the minimal and maximal CN reported for each gene)
  ref_genes.list[["summary"]]$CN$MeanCopyNumber <- rowMeans(cbind(ref_genes.list[["summary"]]$CN$MinCopyNumber, ref_genes.list[["summary"]]$CN$MaxCopyNumber))
    
  ##### Deal with negative CN values by setting them to 0
  ref_genes.list[["summary"]]$CN$MeanCopyNumber[ ref_genes.list[["summary"]]$CN$MeanCopyNumber < 0 ] <- 0

  ##### Limit the data to include only cancer genes
  ref_genes.list[["summary"]]$CN <- ref_genes.list[["summary"]]$CN[ ref_genes.list[["summary"]]$CN$Gene %in% rownames(ref_genes.list[["genes_cancer"]]), ]

  ##### Keep only altered genes with CN values below loss threshold (default 5th percentile) and above gain threshold (default 95th percentile)
  ##### The default parameter values (5 and 95) are interpreted as percentiles of the observed mean CN values. Any other values are used directly as CN thresholds
  if ( params$cn_loss == 5 && params$cn_gain == 95 ) {
    
    ##### Quantiles are computed in 5% steps, so the 2nd element is the 5th percentile and the 20th element is the 95th percentile
    cn_data.all.percent <- quantile(ref_genes.list[["summary"]]$CN$MeanCopyNumber, probs = seq(0, 1, .05), na.rm = TRUE)
    cn_bottom <- round(cn_data.all.percent[2], digits = 2)
    cn_top <- round(cn_data.all.percent[20], digits = 2)
  
  } else {
    cn_bottom <- params$cn_loss
    cn_top <- params$cn_gain
  }
  
  ##### If the difference is 0 then increase/decrease threshold by 1
  if  ( abs(cn_top-cn_bottom) == 0 ) {
    cn_top <- cn_top + 1
    cn_bottom <- cn_bottom - 1
  }
  
  ##### Record the genes with mean CN at or below the loss threshold, or at or above the gain threshold
  ref_genes.list[["summary"]]$CN <- unique(ref_genes.list[["summary"]]$CN[ ref_genes.list[["summary"]]$CN$MeanCopyNumber <= cn_bottom | ref_genes.list[["summary"]]$CN$MeanCopyNumber >= cn_top, ]$Gene)
  
  ##### Remove NAs
  if ( length(ref_genes.list[["summary"]]$CN) > 0 ) {
    ref_genes.list[["summary"]]$CN <- ref_genes.list[["summary"]]$CN[ !(is.na(ref_genes.list[["summary"]]$CN)) ]
  } else {
    ref_genes.list[["summary"]]$CN <- NULL
  }
}

# Immune response markers
ref_genes.list[["summary"]]$Immune <- unique(ref_genes.list[["genes_immune"]]$immune_markers$SYMBOL)

##### Extend the immune markers with the immunogram genes, if requested. NAs are removed only in this case
if ( params$immunogram ) {
  ref_genes.list[["summary"]]$Immune <- unique(c(
    ref_genes.list[["summary"]]$Immune,
    ref_genes.list[["genes_immune"]]$immunogram$SYMBOL
  ))
  ref_genes.list[["summary"]]$Immune <- ref_genes.list[["summary"]]$Immune[ !is.na(ref_genes.list[["summary"]]$Immune) ]
}

# HRD (homologous recombination deficiency) genes, with NAs removed
ref_genes.list[["summary"]]$HRD <- unique(ref_genes.list[["genes_hrd"]]$SYMBOL)
ref_genes.list[["summary"]]$HRD <- ref_genes.list[["summary"]]$HRD[ !is.na(ref_genes.list[["summary"]]$HRD) ]

# Cancer genes derived from UMCCR Cancer Gene list (https://github.com/vladsaveliev/NGS_Utils/blob/master/ngs_utils/reference_data/key_genes/umccr_cancer_genes.2019-03-20.tsv) and from OncoKB portal (http://oncokb.org/#/cancerGenes), with NAs removed
ref_genes.list[["summary"]]$Cancer <- rownames(ref_genes.list[["genes_cancer"]])
ref_genes.list[["summary"]]$Cancer <- ref_genes.list[["summary"]]$Cancer[ !is.na(ref_genes.list[["summary"]]$Cancer) ]

##### Pool the genes from all categories into one non-redundant vector of genes of interest
genes2keep <- unique( unlist(ref_genes.list[["summary"]], use.names = FALSE) )
```

```{r goi_annotation, comment = NA, message=FALSE, warning=FALSE}
##### Annotate the genes of interest with ENSEMBL gene IDs. These genes will not be filtered out due to low/insufficient expression
##### Load the EnsDb annotation object for the requested Ensembl version
edb <- eval(parse(text = paste0("EnsDb.Hsapiens.v", params$ensembl_version)))

##### List all gene IDs available in the annotation database
keys <- keys(edb, keytype = "GENEID")

##### Retrieve gene ID to gene name mapping and rename the columns to "ENSEMBL" and "SYMBOL"
gene_info <- ensembldb::select(
  edb,
  keys = keys,
  columns = c("GENEID", "GENENAME"),
  keytype = "GENEID"
)
names(gene_info) <- gsub("GENENAME", "SYMBOL", gsub("GENEID", "ENSEMBL", names(gene_info)))

##### Limit genes annotation to the genes of interest
genes2keep <- gene_info[ gene_info$SYMBOL %in% genes2keep, ]

##### Keep one row per ENSEMBL ID and use the IDs as row names
genes2keep <- genes2keep[ !duplicated(genes2keep$ENSEMBL), ]
rownames(genes2keep) <- genes2keep$ENSEMBL

##### Resolve gene symbols with multiple ENSEMBL IDs (Y_RNAs, SNORs, LINC0s etc)
##### First keep every ENSEMBL ID that is present in the count data...
genes2keep.combined_data <- genes2keep[ genes2keep$ENSEMBL %in% rownames(ref_dataset.list[[dataset]][["combined_data"]]), ]

##### ...then, for the symbols not covered by the count data, keep the first ENSEMBL ID only
genes2keep <- genes2keep[ genes2keep$SYMBOL %!in% genes2keep.combined_data$SYMBOL, ]
genes2keep <- genes2keep[ !duplicated(genes2keep$SYMBOL), ]

##### ...and finally put both sets together
genes2keep <- rbind(genes2keep.combined_data, genes2keep)

##### Add column to store info about filtered genes
genes2keep$EXP <- TRUE

##### Clean the space
rm(edb, keys, gene_info)
```

```{r library_size_plot, message = FALSE, warning = FALSE, echo = TRUE, fig.width = 12, fig.height = 9}
suppressMessages(library(plotly))
##### Generate bar-plot for library size. The colours indicate sample groups, as provided in *Target* column in the sample annotation file

##### Get the read counts and the samples annotation, and label the investigated sample as "Patient"
data <- ref_dataset.list[[dataset]][["combined_data"]]
target <- ref_dataset.list[[dataset]][["sample_annot"]]
target$Target[ target$Target==sample_name ] <- "Patient"
rownames(target)[ rownames(target)==sample_name ] <- "Patient"

##### Change the datasets levels order (order of appearance rather than alphabetical)
target$Target <- factor(target$Target, levels = unique(target$Target))

##### Assign colours to targets and datasets
targets.colour <- getColours(target$Target)

##### Prepare data frame with the library size (total read count per sample) expressed in millions
data.df <- data.frame(rownames(target), as.numeric(colSums(data)*1e-6), target$Target)
colnames(data.df) <- c("Sample", "Library_size", "Target")

##### The default order will be alphabetized unless specified as below
data.df$Sample <- factor(data.df$Sample, levels = data.df[["Sample"]])

library_size <- plot_ly(data.df, x = ~Sample, y = ~Library_size, color = ~Target, colors = targets.colour[[1]], type = 'bar', width = 800, height = 400) %>%
  layout(title = "", xaxis = list( tickfont = list(size = 10), title = "", showticklabels = FALSE), yaxis = list(title = "Library size (millions)"), margin = list(l=50, r=50, b=50, t=50, pad=4), autosize = FALSE, showlegend=TRUE, legend = list(orientation = 'h', y = max(data.df$Library_size), bgcolor = "white"))

##### Create directory for input data plots (the same directory is used by the subsequent plotting chunks)
PlotsDir <- paste(results_dir, "InputDataPlots", sep = "/")
if ( !dir.exists(PlotsDir) ) {
  dir.create(PlotsDir, recursive=TRUE)
}

##### Save interactive plot as html file
saveWidgetFix(library_size, file = paste(PlotsDir, "library_size.html", sep = "/"))
  
##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload=FALSE)
```

```{r data_transformation_filtering, comment = NA, message=FALSE, warning=FALSE}
##### Filtering to remove low expressed genes. For differential expression and related analyses, gene expression is rarely considered at the level of raw counts since libraries sequenced at a greater depth will result in higher counts. Rather, it is common practice to transform raw counts onto a scale that accounts for such library size differences. Genes with very low counts across all libraries provide little evidence for differential expression. In the biological point of view, a gene must be expressed at some minimal level before it is likely to be translated into a protein or to be biologically important. In addition, the pronounced discreteness of these counts interferes with some of the statistical approximations that are used later in the pipeline. These genes should be filtered out prior to further analysis. Users should filter with CPM rather than filtering on the counts directly, as the latter does not account for differences in library sizes between samples. For instance for the CPM-transformed data we keep only genes that have CPM above 1 in at least 10% of samples

##### Transformation to CPM or TPM scale (see these blogs for details https://www.rna-seqblog.com/rpkm-fpkm-and-tpm-clearly-explained/ and https://haroldpimentel.wordpress.com/2014/05/08/what-the-fpkm-a-review-rna-seq-expression-units/ ).  CPM = Counts Per Million,  TPM = Transcripts Per Million. 

##### For counts data processing consider the investigated sample and internal reference cohort as one group  (regardless of the investigated patient tissue origin), and TCGA data (of any cancer type) as another group. This is to facilitate batch-effects (related with technical aspects) correction process
target_mod <- ref_dataset.list[[dataset]][["sample_annot"]]
target_mod$Dataset <- gsub(sample_name, int_cancer_group, target_mod$Dataset)
targets_mod.list <- unique(target_mod$Dataset)

##### Create list to hold the processed data for each group (one element per group, named after the group)
y <- vector("list", length(targets_mod.list))
names(y) <- targets_mod.list

##### Keep info about the smallest and the greatest library size (in millions of reads, rounded) among all samples
cpm.min <- round(min(as.numeric(colSums(ref_dataset.list[[dataset]][["combined_data"]])*1e-6)), digits=0)
cpm.max <- round(max(as.numeric(colSums(ref_dataset.list[[dataset]][["combined_data"]])*1e-6)), digits=0)

#### For each group...
for ( group in targets_mod.list ) {
    ##### Subset the samples annotation and the read counts to the samples from the current group
    target <- target_mod[ target_mod$Dataset==group, ]
    data <- ref_dataset.list[[dataset]][["combined_data"]]
    data <- data[ , target_mod$Dataset==group]
    
  ##### CPM transformation and filtering
  if ( params$filter && params$transform == "CPM" ) {
    
    ##### Create EdgeR DGEList object
    y[[group]] <- edgeR::DGEList(counts=data,  group=target$Dataset)
    
    ##### Keep genes with CPM above 1 in at least 10% of samples
    filter.threshold <- 1
    keep <- rowSums(edgeR::cpm(y[[group]])>filter.threshold) >= ncol(data)/10
    
    ##### Note which genes of interest are not expressed
    ##### NOTE(review): names(keep) holds all genes present in the count data, so this flags only the genes of interest that are absent from the count data - confirm whether genes failing the expression filter should be flagged as well
    genes2keep$EXP[ rownames(genes2keep) %!in% names(keep) ] <- FALSE
    
    ##### Keep the genes of interest too
    keep[ names(keep) %in% rownames(genes2keep) ] <- TRUE
    y[[group]]$filtered <- y[[group]][keep, , keep.lib.sizes=FALSE]
    
    ##### Transform the raw-scale to CPM (both unfiltered and filtered data). Add small offset to each observation to avoid taking log of zero
    y[[group]]$transformed <- edgeR::cpm(y[[group]], normalized.lib.sizes=FALSE, log=params$log, prior.count=0.25)
    y[[group]]$filtered.transformed <- edgeR::cpm(y[[group]]$filtered, normalized.lib.sizes=FALSE, log=params$log, prior.count=0.25)
  
  ##### CPM transformation without filtering
  } else if ( !params$filter && params$transform == "CPM" ) {
    ##### Create EdgeR DGEList object
    y[[group]] <- edgeR::DGEList(counts=data,  group=target$Dataset)
    
    ##### Transform the raw-scale to CPM. Add small offset to each observation to avoid taking log of zero
    y[[group]]$transformed <- edgeR::cpm(y[[group]], normalized.lib.sizes=FALSE, log=params$log, prior.count=0.25)
    
  ##### TPM data transformation. We can convert RPKM to TPM in two different ways: from pre-calculated RPKM, by dividing by the sum of RPKM values, or directly from the normalized counts. Here we calculate TPM starting from RPKM values computed using edgeR's rpkm function ( from http://luisvalesilva.com/datasimple/rna-seq_units.html )
  ##### TPM transformation with filtering
  } else if ( params$filter && params$transform == "TPM" ) {
    
    ##### Get genes lengths
    edb <- eval(parse(text = paste0("EnsDb.Hsapiens.v", params$ensembl_version)))
    gene.length <- lengthOf(edb, filter = GeneIdFilter(rownames(data)))
    
    ##### Check for which genes the length info is not available and remove them from the data
    genes.no_length <- rownames(data)[ rownames(data) %!in% names(gene.length)]
    data <- data[ rownames(data) %!in% genes.no_length, ]
    
    ##### Create EdgeR DGEList object
    y[[group]] <- edgeR::DGEList(counts=data,  group=target$Dataset)
    
    ##### Convert data into RPKM
    ##### NOTE(review): gene.length is passed as returned by lengthOf - presumably in the same order as the rows of the data; confirm
    y[[group]]$transformed <- edgeR::rpkm(y[[group]], gene.length = gene.length, normalized.lib.sizes=FALSE, log=FALSE)
    
    ##### ... and then to TPM scale. Add small offset to each observation to avoid taking log of zero
    if ( params$log ) {
      y[[group]]$transformed <- log2(tpm_from_rpkm(y[[group]]$transformed+0.25))
      
      ##### Keep genes with transformed value above the threshold (1 + the 0.25 offset) in at least 10% of samples
      ##### NOTE(review): the threshold is compared against log2-scale values here - confirm that this is intended
      filter.threshold <- 1+0.25
      keep <- rowSums(y[[group]]$transformed > filter.threshold) >= ncol(y[[group]]$transformed)/10
      
      ##### Note which genes of interest are not expressed (see the review note in the CPM branch - only genes absent from the data are flagged)
      genes2keep$EXP[ rownames(genes2keep) %!in% names(keep) ] <- FALSE
    
      ##### Keep the genes of interest too
      keep[ names(keep) %in% rownames(genes2keep) ] <- TRUE
      y[[group]]$filtered <- y[[group]]$counts[keep, ]
      y[[group]]$filtered.transformed <- y[[group]]$transformed[keep, ]
   
    } else {
      y[[group]]$transformed <- tpm_from_rpkm(y[[group]]$transformed)
      
      ##### Keep genes with TPM above 1 in at least 10% of samples
      filter.threshold <- 1
      keep <- rowSums(y[[group]]$transformed > filter.threshold) >= ncol(y[[group]]$transformed)/10
      
      ##### Note which genes of interest are not expressed (see the review note in the CPM branch - only genes absent from the data are flagged)
      genes2keep$EXP[ rownames(genes2keep) %!in% names(keep) ] <- FALSE
    
      ##### Keep the genes of interest too
      keep[ names(keep) %in% rownames(genes2keep) ] <- TRUE
      y[[group]]$filtered <- y[[group]]$counts[keep, ]
      y[[group]]$filtered.transformed <- y[[group]]$transformed[keep, ]
    }
  
  ##### TPM transformation without filtering
  } else if ( !params$filter && params$transform == "TPM" ) {
    
    ##### Get genes lengths
    edb <- eval(parse(text = paste0("EnsDb.Hsapiens.v", params$ensembl_version)))
    gene.length <- lengthOf(edb, filter = GeneIdFilter(rownames(data)))
    
    ##### Check for which genes the length info is not available and remove them from the data
    genes.no_length <- rownames(data)[ rownames(data) %!in% names(gene.length)]
    data <- data[ rownames(data) %!in% genes.no_length, ]
    
    ##### Create EdgeR DGEList object
    y[[group]] <- edgeR::DGEList(counts=data,  group=target$Dataset)
    
    ##### Convert data into RPKM
    y[[group]]$transformed <- edgeR::rpkm(y[[group]], gene.length = gene.length, normalized.lib.sizes=FALSE, log=FALSE)
    
    ##### ... and then to TPM scale. Add small offset to each observation to avoid taking log of zero
    if ( params$log ) {
      y[[group]]$transformed <- log2(tpm_from_rpkm(y[[group]]$transformed+0.25))
    } else {
      y[[group]]$transformed <- tpm_from_rpkm(y[[group]]$transformed)
    }
  }
}

##### Now combine DGEList objects created for each group
##### NOTE(review): only the first two groups in targets_mod.list are combined - this assumes exactly two groups (internal and external reference data); confirm
y[["comb"]]$transformed <- cbind(y[[targets_mod.list[1]]]$transformed, y[[targets_mod.list[2]]]$transformed)
y[["comb"]]$samples <- rbind(y[[targets_mod.list[1]]]$samples, y[[targets_mod.list[2]]]$samples)

if ( params$filter ) {
  
  ##### Keep only genes present in all sets (the filtering is done per group, so the sets of retained genes may differ)
  genes_mod <- intersect(rownames(y[[targets_mod.list[1]]]$filtered), rownames(y[[targets_mod.list[2]]]$filtered))
  y[[targets_mod.list[1]]]$filtered <- y[[targets_mod.list[1]]]$filtered[ rownames(y[[targets_mod.list[1]]]$filtered) %in% genes_mod, ]
  y[[targets_mod.list[2]]]$filtered <- y[[targets_mod.list[2]]]$filtered[ rownames(y[[targets_mod.list[2]]]$filtered) %in% genes_mod, ]
  y[[targets_mod.list[1]]]$filtered.transformed <- y[[targets_mod.list[1]]]$filtered.transformed[ rownames(y[[targets_mod.list[1]]]$filtered.transformed) %in% genes_mod, ]
  y[[targets_mod.list[2]]]$filtered.transformed <- y[[targets_mod.list[2]]]$filtered.transformed[ rownames(y[[targets_mod.list[2]]]$filtered.transformed) %in% genes_mod, ]
 
  ##### Combine the filtered data from both groups
  y[["comb"]]$filtered <- cbind(y[[targets_mod.list[1]]]$filtered, y[[targets_mod.list[2]]]$filtered)
  y[["comb"]]$filtered.transformed <- cbind(y[[targets_mod.list[1]]]$filtered.transformed, y[[targets_mod.list[2]]]$filtered.transformed)
}

##### Clean the space
##### NOTE(review): "genes_mod" and "keep" are created in this chunk only when params$filter is set, so rm() presumably warns about missing objects otherwise - confirm
rm(target, target_mod, genes_mod, keep)
```

```{r data_transformation_plot, comment = NA, message=FALSE, warning=FALSE, fig.width = 12, fig.height = 6, fig.show="hide"}
##### Assign colours to targets and datasets
target <- ref_dataset.list[[dataset]][["sample_annot"]]
targets.colour <- getColours(target$Target)

##### Define the legend labels before branching on the filtering option. They are used for the density plots with and without filtering, as well as by the normalisation plots generated afterwards
if ( !is.null(add_cancer_group) ) {
  legend <- c(ext_cancer_group, add_cancer_group, int_cancer_group, "Patient")
} else {
  legend <- c(ext_cancer_group, int_cancer_group, "Patient")
}
  
##### Collect the most extreme density values to set the x-axis and y-axis boundaries. Only the minimum (first element) and the maximum (last element) are used below
den <- density(y[["comb"]]$transformed[,1])
den.x <- range(den$x)
den.y <- range(den$y)
  
for (i in seq_len(ncol(y[["comb"]]$transformed))[-1]) {
  den <- density(y[["comb"]]$transformed[,i])
  den.x <- range(den.x, den$x)
  den.y <- range(den.y, den$y)
}

##### Plot read counts against transformed data
if ( params$filter ) {
  suppressMessages(library(plotly))
  
  ##### Organise the data into data frame. The last column of the combined data is used (labelled as "Patient" in the plot)
  if ( params$log ) {
    data.df <- as.data.frame(cbind( exp(y[["comb"]]$transformed[,ncol(y[["comb"]]$transformed)]), ref_dataset.list[[dataset]][["combined_data"]][,ncol(ref_dataset.list[[dataset]][["combined_data"]])]))
    names(data.df) <- c("Transformed", "Counts")
    data.df$Transformed <- log(data.df$Transformed)
    
  } else {
     data.df <- as.data.frame(cbind( y[["comb"]]$transformed[,ncol(y[["comb"]]$transformed)], ref_dataset.list[[dataset]][["combined_data"]][,ncol(ref_dataset.list[[dataset]][["combined_data"]])]))
    names(data.df) <- c("Transformed", "Counts")
  }
  
  ##### Keep only genes with read counts below the 99th percentile
  data.df <- data.df[ data.df$Counts < quantile(data.df$Counts, 0.99), ]
  
  ##### Keep only every 25th genes to reduce the size of the plot
  data.df <- data.df[ seq(1,nrow(data.df), by=25), ]
  
  ##### Generate plot for filtered data, with a vertical line marking the filtering threshold
  counts_vs_transformed <- plot_ly( data.df, x = ~Transformed, y = ~Counts, width = 800, height = 300, color = I('black'), marker = list(size = 5), type="scatter", mode = "markers", name = paste0(params$transform, " / Counts (Patient)") ) %>% 
    add_trace(x = c(filter.threshold, filter.threshold), y= c(0, max(data.df$Counts)), mode = "lines", color = I("red"), name = "Filtering threshold") %>%
    
    layout(title = "", xaxis = list(title = paste0(params$transform, "s")), yaxis = list(title = "Counts"), showlegend=TRUE)
  
  ##### Save interactive plot as html file
  saveWidgetFix(counts_vs_transformed, file = paste(PlotsDir, "counts_vs_transformed.html", sep = "/"))

  ##### Detach plotly package. Otherwise it clashes with other graphics devices
  detach("package:plotly", unload=FALSE)
  
  ##### Before filtering (one density curve per sample)
  par(mfrow=c(1,2))
  plot(density(y[["comb"]]$transformed[,1]), lwd=2, xlim=c(den.x[1],max(data.df$Transformed)), ylim=c(den.y[1],den.y[length(den.y)]), las=2, main="", xlab="", col=targets.colour[[2]][1])
  title(main="Transformed data (unfiltered)", xlab=params$transform)
  abline(v=0, lty=3)
  
  for (i in seq_len(ncol(y[["comb"]]$transformed))[-1]){
    den <- density(y[["comb"]]$transformed[,i])
    lines(den$x, den$y, lwd=2, col=targets.colour[[2]][i])
  }
  legend("topright", legend=legend, fill=targets.colour[[1]], bty="n", bg = "transparent")
  
  data_transformation_nonfiltered <- recordPlot()
  
  ##### After filtering
  plot(density(y[["comb"]]$filtered.transformed[,1]), lwd=2, xlim=c(den.x[1],max(data.df$Transformed)), ylim=c(den.y[1],den.y[length(den.y)]), las=2, main="", xlab="", col=targets.colour[[2]][1])
  title(main="Transformed and filtered data", xlab=params$transform)
  abline(v=0, lty=3)
  
  for (i in seq_len(ncol(y[["comb"]]$filtered.transformed))[-1]){
    den <- density(y[["comb"]]$filtered.transformed[,i])
    lines(den$x, den$y, lwd=2, col=targets.colour[[2]][i])
  }
  legend("topright", legend=legend, fill=targets.colour[[1]], bty="n", bg = "transparent")
  
  data_transformation_filtered <- recordPlot()
  
  ##### Save the plot as png file
  png(paste0(PlotsDir, "/filtering.png"), width=900, height=400, pointsize = 14)
  par(mfrow=c(1,2))
  
  ##### Before filtering
  plot(density(y[["comb"]]$transformed[,1]), lwd=2, xlim=c(den.x[1],den.x[length(den.x)]), ylim=c(den.y[1],den.y[length(den.y)]), las=2, main="", xlab="", col=targets.colour[[2]][1])
  title(main="Transformed data (unfiltered)", xlab=params$transform)
  abline(v=0, lty=3)
  
  for (i in seq_len(ncol(y[["comb"]]$transformed))[-1]){
    den <- density(y[["comb"]]$transformed[,i])
    lines(den$x, den$y, lwd=2, col=targets.colour[[2]][i])
  }
  legend("topright", legend=legend, fill=targets.colour[[1]], cex = 0.7, bty="n", bg = "transparent")
  
  ##### After filtering
  plot(density(y[["comb"]]$filtered.transformed[,1]), lwd=2, xlim=c(den.x[1],den.x[length(den.x)]), ylim=c(den.y[1],den.y[length(den.y)]), las=2, main="", xlab="", col=targets.colour[[2]][1])
  title(main="Transformed and filtered data", xlab=params$transform)
  abline(v=0, lty=3)
  
  for (i in seq_len(ncol(y[["comb"]]$filtered.transformed))[-1]){
    den <- density(y[["comb"]]$filtered.transformed[,i])
    lines(den$x, den$y, lwd=2, col=targets.colour[[2]][i])
  }
  legend("topright", legend=legend, fill=targets.colour[[1]], cex = 0.7, bty="n", bg = "transparent")
  invisible(dev.off())
  
##### Without filtering
} else {
  plot(density(y[["comb"]]$transformed[,1]), lwd=2, xlim=c(den.x[1],den.x[length(den.x)]), ylim=c(den.y[1],den.y[length(den.y)]), las=2, main="", xlab="", col=targets.colour[[2]][1])
  title(main="Transformed data (unfiltered)", xlab=params$transform)
  abline(v=0, lty=3)
  
  for (i in seq_len(ncol(y[["comb"]]$transformed))[-1]){
    den <- density(y[["comb"]]$transformed[,i])
    lines(den$x, den$y, lwd=2, col=targets.colour[[2]][i])
  }
  legend("topright", legend=legend, fill=targets.colour[[1]], bty="n", bg = "transparent")
  
  data_transformation_nonfiltered <- recordPlot()
  
  #### Clear plots to free up some memory
  if(!is.null(dev.list())) invisible(dev.off())
  
  ##### Save the plot as png file
  png(paste0(PlotsDir, "/filtering.png"), width=900, height=400, pointsize = 14)
  plot(density(y[["comb"]]$transformed[,1]), lwd=2, xlim=c(den.x[1],den.x[length(den.x)]), ylim=c(den.y[1],den.y[length(den.y)]), las=2, main="", xlab="", col=targets.colour[[2]][1])
  title(main="Transformed data (unfiltered)", xlab=params$transform)
  abline(v=0, lty=3)
  
  for (i in seq_len(ncol(y[["comb"]]$transformed))[-1]){
    den <- density(y[["comb"]]$transformed[,i])
    lines(den$x, den$y, lwd=2, col=targets.colour[[2]][i])
  }
  legend("topright", legend=legend, fill=targets.colour[[1]], cex = 0.7, bty="n", bg = "transparent")
  invisible(dev.off())
}

##### Clean the space
rm(data, data.df, target, den.x, den.y)
```

```{r data_normalisation, comment = NA, message=FALSE, warning=FALSE }
##### External factors that are not of biological interest (e.g. the batch in which the samples were prepared or sequenced) can shift the expression of individual samples. All samples are assumed to have a similar range and distribution of expression values, hence normalisation for sample-specific effects is applied to make the expression distributions of the samples similar across the entire experiment.

##### CPM data: the normalisation factors are calculated from the raw read counts with edgeR's calcNormFactors function, using the method defined by the user, e.g. trimmed mean of M-values (TMM, https://www.ncbi.nlm.nih.gov/pubmed/20196867), and are used to scale the library sizes. TMM is recommended for most RNA-Seq data where the majority (more than half) of the genes are believed not to be differentially expressed between any pair of samples.
##### TPM data: quantile normalisation is applied to the transformed data (from https://www.biostars.org/p/296992/ )

#### For each group...
for ( group in targets_mod.list ) {
  
  ##### TPM data: quantile normalisation
  if ( params$transform == "TPM" ) {
    
    if ( params$filter ) {
      
      ##### Work on the filtered data. Keep the unnormalised version and make sure the data is a numeric matrix
      y[[group]]$noNorm <- y[[group]]$filtered.transformed
      y[[group]]$filtered.transformed <- data.matrix(y[[group]]$filtered.transformed)
      
      if ( tolower(params$norm) == "none" ) {
        y[[group]]$norm <- y[[group]]$filtered.transformed
      } else {
        ##### Restore the gene and sample names after the normalisation
        y[[group]]$norm <- normalize.quantiles(y[[group]]$filtered.transformed, copy = TRUE)
        rownames(y[[group]]$norm) <- rownames(y[[group]]$filtered.transformed)
        colnames(y[[group]]$norm) <- colnames(y[[group]]$filtered.transformed)
      }
      
    } else {
      
      ##### Work on the unfiltered data. Keep the unnormalised version and make sure the data is a numeric matrix
      y[[group]]$noNorm <- y[[group]]$transformed
      y[[group]]$transformed <- data.matrix(y[[group]]$transformed)
      
      if ( tolower(params$norm) == "none" ) {
        y[[group]]$norm <- y[[group]]$transformed
      } else {
        ##### Restore the gene and sample names after the normalisation
        y[[group]]$norm <- normalize.quantiles(y[[group]]$transformed, copy = TRUE)
        rownames(y[[group]]$norm) <- rownames(y[[group]]$transformed)
        colnames(y[[group]]$norm) <- colnames(y[[group]]$transformed)
      }
    }
    
  ##### CPM data: calculate the normalisation factors from the raw counts and transform the counts to CPM using the normalised library sizes
  } else if ( params$transform == "CPM" ) {
    
    if ( params$filter ) {
      y[[group]]$filtered$samples["norm.factors"] <- edgeR::calcNormFactors(y[[group]]$filtered, method = params$norm)$samples["norm.factors"]
      y[[group]]$norm <- edgeR::cpm(y[[group]]$filtered, normalized.lib.sizes = TRUE, log = params$log, prior.count = 0.25)
      y[[group]]$noNorm <- y[[group]]$filtered.transformed
      
    } else {
      y[[group]]$samples["norm.factors"] <- edgeR::calcNormFactors(y[[group]], method = params$norm)$samples["norm.factors"]
      y[[group]]$norm <- edgeR::cpm(y[[group]], normalized.lib.sizes = TRUE, log = params$log, prior.count = 0.25)
      y[[group]]$noNorm <- y[[group]]$transformed
    }
  }
}

##### Put the data from the first two groups side by side
y[["comb"]]$norm <- cbind(y[[targets_mod.list[1]]]$norm, y[[targets_mod.list[2]]]$norm)
y[["comb"]]$noNorm <- cbind(y[[targets_mod.list[1]]]$noNorm, y[[targets_mod.list[2]]]$noNorm)

##### Store the normalised data as the processed data, unless no normalisation was requested
ref_dataset.list[[dataset]][["combined_data_processed"]] <- if ( tolower(params$norm) == "none" ) {
  y[["comb"]]$noNorm
} else {
  y[["comb"]]$norm
}

##### Clean the space
rm(targets_mod.list)
```

```{r data_normalisation_plot, comment = NA, message=FALSE, warning=FALSE, fig.width = 12, fig.height = 6, fig.show="hide"}
##### Plot expression distribution of samples for unnormalised and normalised data

##### Define the legend labels here, so that the plots do not depend on the labels created only in the filtering branch of the data transformation plot
if ( !is.null(add_cancer_group) ) {
  legend <- c(ext_cancer_group, add_cancer_group, int_cancer_group, "Patient")
} else {
  legend <- c(ext_cancer_group, int_cancer_group, "Patient")
}

par(mfrow=c(2,1), mar=c(2, 5, 3, 2))

##### Unnormalised data (one box per sample, coloured by sample group; x-axis labels and outliers are not drawn)
boxplot(y[["comb"]]$noNorm, las=2, col=targets.colour[[2]], main="", pch="", xaxt="n", outline = FALSE)
title(main="Unnormalised data", ylab=params$transform)
legend("topright", legend=legend, fill=targets.colour[[1]], horiz=TRUE, bg = "transparent", box.col="transparent")

data_nonnormalised <- recordPlot()

##### Normalised data
boxplot(y[["comb"]]$norm, las=2, col=targets.colour[[2]], main="", pch="", xaxt="n", outline = FALSE)
title(main=paste0("Normalised data (", params$norm, ")"), ylab=params$transform)
legend("topright", legend=legend, fill=targets.colour[[1]], horiz=TRUE, bg = "transparent", box.col="transparent")

data_normalised <- recordPlot()

##### Save the plot as png file
png(paste0(PlotsDir, "/normalisation.png"), width=900, height=700, pointsize = 14)
par(mfrow=c(2,1), mar=c(2, 5, 3, 2))
  
##### Unnormalised data
boxplot(y[["comb"]]$noNorm, las=2, col=targets.colour[[2]], main="", pch="", xaxt="n", outline = FALSE)
title(main="Unnormalised data", ylab=params$transform)
legend("topright", legend=legend, fill=targets.colour[[1]], horiz=TRUE, bg = "transparent", cex = 0.7, box.col="transparent")
  
##### Normalised data
boxplot(y[["comb"]]$norm, las=2, col=targets.colour[[2]], main="", pch="", xaxt="n", outline = FALSE)
title(main=paste0("Normalised data (", params$norm, ")"), ylab=params$transform)
legend("topright", legend=legend, fill=targets.colour[[1]], horiz=TRUE, bg = "transparent", cex = 0.7, box.col="transparent")
invisible(dev.off())

##### Clean the space
rm(den, y)
```

```{r batch_effect_correction, comment = NA, message=FALSE, warning=FALSE, eval=params$batch_rm}
##### Batch effects are corrected by treating the investigated sample together with the internal reference cohort as one batch (regardless of the investigated patient tissue origin), and TCGA data (of any cancer type) as another batch. The objective is to remove as much as possible data variation due to technical factors.
##### Batch labels are taken from the "Dataset" column of the samples annotation. The entry holding the investigated sample name is replaced with the internal reference cohort name
batches <- as.character(ref_dataset.list[[dataset]][["sample_annot"]][["Dataset"]])
batches <- replace(batches, match(sample_name, batches), int_cancer_group)

##### Perform batch-effect correction of the processed data using limma
ref_dataset.list[[dataset]][["batch_effect_corrected"]] <- limma::removeBatchEffect(
  ref_dataset.list[[dataset]][["combined_data_processed"]],
  batch = batches
)
```

```{r pca, comment = NA, message=FALSE, warning=FALSE}
suppressMessages(library(plotly))

##### Principal component analysis (PCA) of every combined dataset. With batch-effect removal enabled the PCA is run on the data before and after the correction and the corrected data is reported; otherwise a single PCA is run and the processed data is reported as is
for ( dataset in names(ref_dataset.list) ) {

  ##### Samples annotation with the investigated sample relabelled as "Patient"
  target <- ref_dataset.list[[dataset]][["sample_annot"]]
  target$Dataset <- gsub(sample_name, "Patient", target$Dataset)
  target$Target <- gsub(sample_name, "Patient", target$Target)

  pca_input.processed <- ref_dataset.list[[dataset]][["combined_data_processed"]]

  if ( params$batch_rm ) {
    pca_input.corrected <- ref_dataset.list[[dataset]][["batch_effect_corrected"]]

    ##### PCA before the correction...
    ref_dataset.list[[dataset]][["pca_combined_data_processed"]] <- pca(
      data = pca_input.processed,
      targets = target,
      title = "Before batch-effects correction",
      report_dir = results_dir,
      suffix = "_before_batch_rm"
    )

    ##### ...and after the correction
    ref_dataset.list[[dataset]][["pca_batch_effect_corrected"]] <- pca(
      data = pca_input.corrected,
      targets = target,
      title = "After batch-effects correction",
      report_dir = results_dir,
      suffix = "_after_batch_rm"
    )

    ##### The batch-effect corrected data is the one to be reported
    ref_dataset.list[[dataset]][["data_to_report"]] <- pca_input.corrected

  } else {
    ref_dataset.list[[dataset]][["pca_combined_data_processed"]] <- pca(
      data = pca_input.processed,
      targets = target,
      report_dir = results_dir
    )

    ##### No correction requested, report the processed data
    ref_dataset.list[[dataset]][["data_to_report"]] <- pca_input.processed
  }
}

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload=FALSE)

#### Clear plots to free up some memory
if ( !is.null(dev.list()) ) {
  invisible(dev.off())
}

##### Drop the temporary PCA inputs ("target" is kept, as before)
rm(list = intersect(c("pca_input.processed", "pca_input.corrected"), ls()))
```

```{r rle, comment = NA, message=FALSE, warning=FALSE, fig.width = 12, fig.height = 6, fig.show="hide"}
##### Generate relative log expression (RLE) plot using combined-only data and batch-effect corrected data

##### Helper drawing a single RLE panel on the active graphics device: the RLE boxplot coloured per sample (targets.colour[[2]]), the panel title and the legend of sample groups (targets.colour[[1]])
#   expr_data    expression matrix passed to plotRLE
#   groups       per-sample group labels; their sorted unique values make up the legend
#   panel_title  main title of the panel (may be "")
drawRLEpanel <- function(expr_data, groups, panel_title) {
  plotRLE(expr_data, col=targets.colour[[2]], main="", pch="", las=3, xaxt="n", outline = FALSE)
  title(main=panel_title, ylab="RLE")
  legend("topright", legend=levels(factor(groups)), fill=targets.colour[[1]], horiz=TRUE, bg = "transparent", box.col="transparent")
}

##### Loop through combined datasets and generate RLE plot
for ( dataset in names(ref_dataset.list) ) {
  target <- ref_dataset.list[[dataset]][["sample_annot"]]
  target$Dataset <- gsub(sample_name, "Patient", target$Dataset)
  target$Target <- gsub(sample_name, "Patient", target$Target)
  
  if ( params$batch_rm ) {
    par(mfrow=c(2,1), mar=c(2, 5, 3, 2))
    
    ##### Before batch-effects correction. The plot is recorded right after the first panel is drawn, i.e. before the second panel is added to the same device
    drawRLEpanel(ref_dataset.list[[dataset]][["combined_data_processed"]], target$Target, "Before batch-effects correction")
    ref_dataset.list[[dataset]][["rle_combined_data_processed"]] <- recordPlot()
    
    ##### After batch-effects correction
    drawRLEpanel(ref_dataset.list[[dataset]][["batch_effect_corrected"]], target$Target, "After batch-effects correction")
    ref_dataset.list[[dataset]][["rle_batch_effect_corrected"]] <- recordPlot()
    
    ##### Save both panels as png file
    png(paste0(PlotsDir, "/rle.png"), width=900, height=700, pointsize = 14)
    par(mfrow=c(2,1), mar=c(2, 5, 3, 2))
    drawRLEpanel(ref_dataset.list[[dataset]][["combined_data_processed"]], target$Target, "Before batch-effects correction")
    drawRLEpanel(ref_dataset.list[[dataset]][["batch_effect_corrected"]], target$Target, "After batch-effects correction")
    invisible(dev.off())

  } else {
    drawRLEpanel(ref_dataset.list[[dataset]][["combined_data_processed"]], target$Target, "")
    ref_dataset.list[[dataset]][["rle_combined_data_processed"]] <- recordPlot()
    
    ##### Save the plot as png file
    png(paste0(PlotsDir, "/rle.png"), width=900, height=450, pointsize = 14)
    drawRLEpanel(ref_dataset.list[[dataset]][["combined_data_processed"]], target$Target, "")
    invisible(dev.off())
  }
}

#### Clear plots to free up some memory
if ( !is.null(dev.list()) ) {
  invisible(dev.off())
}

##### Clean the space. "den" and "y" may have been removed already by an earlier chunk, so only objects that still exist are removed
rm(list = intersect(c("targets.colour", "den", "y", "drawRLEpanel"), ls()))
```

```{r gene_annot_count_data, comment = NA, message=FALSE, warning=FALSE}
##### Annotate ALL genes of the combined, BUT NOT PROCESSED, datasets. This part is mainly required for biotype detection step
##### NOTE: "gene_info" is deliberately NOT removed at the end of this chunk, since the chunk annotating the processed data merges against it

##### The Ensembl annotation does not depend on the dataset, so it is queried once, before the loop
##### Get genes annotation and genomic locations
edb <- get(paste0("EnsDb.Hsapiens.v", params$ensembl_version))

##### Get all Ensembl gene IDs available in the annotation database
ensembl_gene_ids <- keys(edb, keytype="GENEID")

##### Get genes biotype, symbol and genomic coordinates
gene_info.ensembl <- ensembldb::select(edb, keys=ensembl_gene_ids, columns=c("GENEID", "GENEBIOTYPE", "GENENAME", "SEQNAME", "GENESEQSTART", "GENESEQEND"), keytype="GENEID")
names(gene_info.ensembl) <- gsub("GENEID", "ENSEMBL", names(gene_info.ensembl))
names(gene_info.ensembl) <- gsub("GENENAME", "SYMBOL", names(gene_info.ensembl))

for ( dataset in names(ref_dataset.list) ) {
  
  ##### Convert data into a data frame to make the Ensembl ID and gene symbol matches (with merge function)
  data <- ref_dataset.list[[dataset]][["combined_data"]]
  data.df <- as.data.frame(cbind(rownames(data), data))
  colnames(data.df)[1] <- "ENSEMBL"
  
  ##### Limit genes annotation to those genes for which sample expression measurements are available
  gene_info <- gene_info.ensembl[ gene_info.ensembl$ENSEMBL %in% data.df$ENSEMBL, ]
  
  ##### Remove rows with duplicated ENSEMBL IDs
  gene_info <- gene_info[ !duplicated(gene_info$ENSEMBL), ]
  rownames(gene_info) <- gene_info$ENSEMBL
  
  ##### Remove rows with duplicated gene symbols (Y_RNAs, SNORs, LINC0s etc)
  gene_info <- gene_info[ !duplicated(gene_info$SYMBOL), ]
  
  ##### Add info about immune response markers (genes that are not markers are kept, with missing marker info)
  gene_info.immune_markers <- merge(gene_info, ref_genes.list[["genes_immune"]]$immune_markers, by = "SYMBOL", all.x = TRUE)
  
  ##### Keep only immune response markers for which there is available annotation
  ref_genes.list[["genes_immune"]]$immune_markers <- ref_genes.list[["genes_immune"]]$immune_markers[ ref_genes.list[["genes_immune"]]$immune_markers$SYMBOL %in% gene_info.immune_markers$SYMBOL, ]
  
  ##### Add info about immunogram genes
  if ( params$immunogram ) {
    gene_info.immunogram <- merge(gene_info, ref_genes.list[["genes_immune"]]$immunogram, by = "SYMBOL", all.x = TRUE)
    gene_info.immunogram <- gene_info.immunogram[ !duplicated(gene_info.immunogram[,"ENSEMBL"]), ]
    
    ##### Keep only immunogram genes for which there is available annotation
    ref_genes.list[["genes_immune"]]$immunogram <- ref_genes.list[["genes_immune"]]$immunogram[ ref_genes.list[["genes_immune"]]$immunogram$SYMBOL %in% gene_info.immunogram$SYMBOL, ]
    
    ##### Merge genes annotations for immunogram genes and immune markers
    gene_info <- merge( gene_info.immunogram, gene_info.immune_markers[ , c("ENSEMBL", "Immune_Cycle_Role") ], by = "ENSEMBL")
  } else {
    gene_info <- gene_info.immune_markers
  }
  
  ##### Merge genes genomic coordinates info with their annotation and expression data
  data.annot <- merge(gene_info, data.df, by = "ENSEMBL", all.x = FALSE)
  rownames(data.annot) <- data.annot$ENSEMBL
  
  ##### Keep the annotation columns only (the "CIC" column is available only when the immunogram genes were merged in)
  if ( params$immunogram ) {
    ref_dataset.list[[dataset]][["gene_annot_all"]] <- data.annot[, c("SYMBOL", "GENEBIOTYPE", "ENSEMBL", "SEQNAME", "GENESEQSTART", "GENESEQEND", "CIC", "Immune_Cycle_Role")]
  } else {
    ref_dataset.list[[dataset]][["gene_annot_all"]] <- data.annot[, c("SYMBOL", "GENEBIOTYPE", "ENSEMBL", "SEQNAME", "GENESEQSTART", "GENESEQEND", "Immune_Cycle_Role")]
  }
  
  ##### Save the combined expression matrix and the annotation of all genes into txt files
  write.table(prepare2write(ref_dataset.list[[dataset]][["combined_data"]]), file = paste0(results_dir, "/", sample_name, ".RNAseq_report.combined_data.txt"), sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE, append = FALSE )
  write.table(prepare2write(ref_dataset.list[[dataset]][["gene_annot_all"]]), file = paste0(results_dir, "/", sample_name, ".RNAseq_report.gene_annot_all.txt"), sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE, append = FALSE )
}

##### Clean the space ("target" is a leftover of the preceding plotting chunks)
rm(data, target, data.df, edb, ensembl_gene_ids, gene_info.ensembl)
```

```{r gene_annot_processed_data, comment = NA, message=FALSE, warning=FALSE}
##### Annotate the genes of the data to be reported and switch its row names from Ensembl IDs to gene symbols
##### NOTE: "gene_info" is not created here - the annotation left by the chunk annotating the unprocessed data is used (and removed once this chunk is done)
for ( dataset in names(ref_dataset.list) ) {
  
  ##### Expose the Ensembl IDs (row names) as a regular column, so that the data can be merged with the genes annotation
  data <- ref_dataset.list[[dataset]][["data_to_report"]]
  data.df <- as.data.frame(cbind(rownames(data), data))
  colnames(data.df)[1] <- "ENSEMBL"
  
  ##### Combine genes annotation with the expression data (only genes present in both are retained)
  data.annot <- merge(gene_info, data.df, by = "ENSEMBL", all.x = FALSE)
  
  ##### Discard genes with missing or empty gene symbol
  data.annot <- data.annot[ !is.na(data.annot$SYMBOL) & data.annot$SYMBOL != "", ]
  rownames(data.annot) <- data.annot$SYMBOL
  
  ##### Numeric expression matrix with gene symbols as row names (the values were turned into text when the IDs column was bound to the data above)
  expr_by_symbol <- apply(data.annot[, colnames(data)], 2, as.numeric)
  rownames(expr_by_symbol) <- data.annot$SYMBOL
  ref_dataset.list[[dataset]][["data_to_report"]] <- expr_by_symbol
  
  ##### Annotation of the reported genes
  ref_dataset.list[[dataset]][["gene_annot"]] <- data.annot[, c("SYMBOL", "GENEBIOTYPE", "ENSEMBL", "SEQNAME", "GENESEQSTART", "GENESEQEND", "Immune_Cycle_Role")]
  
  ##### Save the processed expression matrix, its genes list and the samples annotation into txt files
  report_prefix <- paste0(results_dir, "/", sample_name, ".RNAseq_report.")
  
  write.table(prepare2write(expr_by_symbol), file = paste0(report_prefix, "combined_data_processed.txt"), sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE, append = FALSE )
  write.table(prepare2write(toupper(rownames(expr_by_symbol))), file = paste0(report_prefix, "combined_data_processed.genes.txt"), sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE, append = FALSE )
  write.table(prepare2write(ref_dataset.list[[dataset]][["sample_annot"]]), file = paste0(report_prefix, "sample_annot.txt"), sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE, append = FALSE )
}

##### Clean the space
rm(data, data.df, gene_info, expr_by_symbol, report_prefix)
```

```{r gene_annot_processed_data_save, comment = NA, message=FALSE, warning=FALSE, eval=params$save_tables}
##### Save the expression summary of all genes measured in the patient's sample, together with cancer genes annotation, as data table html files
##### NOTE: "dataset" holds the value left by the preceding loop over names(ref_dataset.list)
targets <- ref_dataset.list[[dataset]][["sample_annot"]]
data <- ref_dataset.list[[dataset]][["data_to_report"]]

##### The percentile and Z-score tables are built with the very same settings and differ only in the "type" requested from exprTable
buildExprTable <- function(table_type) {
  exprTable(
    genes = rownames(data),
    keep_all = TRUE,
    data = data,
    targets = targets,
    sampleName = sample_name,
    ext_cancer = ext_cancer_group,
    int_cancer = int_cancer_group,
    comp_cancer = comp_cancer_group,
    add_cancer = add_cancer_group,
    genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
    oncokb_annot = ref_genes.list[["genes_oncokb"]],
    cancer_genes = ref_genes.list[["genes_cancer"]],
    ext_links = TRUE,
    type = table_type,
    scaling = scaling
  )
}

##### Percentiles
genes.expr.perc <- buildExprTable("perc")

##### Z-scores
genes.expr.z <- buildExprTable("z")

##### Make sure the directory for the tables is in place
exprTableDir <- paste0(results_dir, "/exprTables")

if ( !file.exists(exprTableDir) ) {
  dir.create(exprTableDir, recursive=TRUE)
}

##### Save the widgets (first element of each exprTable output) as self-contained html files
saveWidgetFix(widget=genes.expr.perc[[1]], file=paste0(exprTableDir, "/genes.expr.perc.html"), selfcontained=TRUE)
saveWidgetFix(widget=genes.expr.z[[1]], file=paste0(exprTableDir, "/genes.expr.z.html"), selfcontained=TRUE)

##### Clean the space
rm(data, targets, genes.expr.z, genes.expr.perc, buildExprTable)
```

```{r cn_expr_data_prep, comment = NA, message=FALSE, warning=FALSE, eval = runPurpleChunk}
##### Combine expression data with copy-number (PURPLE) data and, if available, mutation (PCGR) data
##### Outputs: ref_dataset.list[[dataset]][["expr_mut_cn_data_all"]] (all overlapping genes) and ref_dataset.list[[dataset]][["expr_mut_cn_data"]] (cancer genes meeting the CN thresholds), as well as "cn_data.all" and "cn_data.all.percent", which are kept for the CN distribution plot
##### NOTE: "dataset" holds the value left by the preceding loop over names(ref_dataset.list)
cn_data <- ref_genes.list[["purple"]]
expr_data <- ref_dataset.list[[dataset]][["data_to_report"]]
targets <- ref_dataset.list[[dataset]][["sample_annot"]]

##### Get the expression summary as percentiles (second element of the exprTable output)...
expr_data.perc <- exprTable( genes = rownames(expr_data), keep_all = TRUE, data = expr_data, targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group, type = "perc", scaling = scaling)[[2]]

expr_genes <- expr_data.perc$SYMBOL

##### ...and as Z-scores
expr_data.z <- exprTable( genes = expr_genes, keep_all = TRUE, data = expr_data, targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group, type = "z", scaling = scaling)[[2]]

##### Make sure BOTH tables have the same genes order, since the values extracted from them below are labelled with "expr_genes"
##### NOTE(review): this assumes the Z-score table is row-named by gene symbol, exactly as the percentile table indexing already requires - confirm against exprTable
expr_data.perc <- expr_data.perc[ expr_genes, ]
expr_data.z <- expr_data.z[ expr_genes, ]

##### Use the "Diff" column when the comparison cohort differs from the internal reference cohort, otherwise the "Patient vs [comp_cancer]" column
expr_column <- if ( comp_cancer_group != int_cancer_group ) "Diff" else paste0( "Patient vs ", comp_cancer_group)
expr_data.perc <- expr_data.perc[, expr_column ]
expr_data.z <- expr_data.z[, expr_column ]

names(expr_data.perc) <- expr_genes
names(expr_data.z) <- expr_genes

##### Calculate the mean CN for each gene
cn_data$MeanCopyNumber <- rowMeans(cbind(cn_data$MinCopyNumber, cn_data$MaxCopyNumber))
  
##### Deal with negative CN values
cn_data$MeanCopyNumber[ cn_data$MeanCopyNumber < 0 ] <- 0

##### Remove entries with missing gene symbol
cn_data <- cn_data[ cn_data$Gene %!in% "", ]

##### Keep the CN values of all genes (before limiting them to genes with expression data) and get their percentiles (0%, 5%, ..., 100%)
cn_data.all <- cn_data
cn_data.all.percent <- quantile(cn_data.all$MeanCopyNumber, probs = seq(0, 1, .05), na.rm = TRUE)

##### Keep only genes with available expression data
cn_data <- cn_data[ cn_data$Gene %in% names(expr_data.z), ]

##### Add mutation data if available
has_mut_data <- !is.null(ref_genes.list[["pcgr"]])

if ( has_mut_data ) {
  mut_data <- ref_genes.list[["pcgr"]]
  
  ##### Remove entries with missing gene symbol (mainly variants in intergenic regions)
  mut_data <- mut_data[ mut_data$SYMBOL %!in% "", ]

  ##### Helper returning the alteration label for a mutated gene: the consequence of its only mutation or a "multiple hits" flag
  describeGeneMutations <- function(gene) {
    consequences <- mut_data[ mut_data$SYMBOL %in% gene, ]$CONSEQUENCE
    
    if ( length(consequences) > 1 ) {
      "Mutation: multiple hits"
    } else {
      paste0("Mutation: ", consequences)
    }
  }

  ##### Initiate the mutation status for each gene with expression data...
  gene.mut <- as.matrix(rep("None", length(expr_data.z)))
  colnames(gene.mut) <- "Alterations"
  rownames(gene.mut) <- names(expr_data.z)

  ##### ...and fill it in for genes with reported mutations
  mutated_genes <- intersect(rownames(gene.mut), mut_data$SYMBOL)
  mutated_genes <- mutated_genes[ !is.na(mutated_genes) ]
  gene.mut[ mutated_genes, "Alterations" ] <- vapply(mutated_genes, describeGeneMutations, character(1))

  ##### If there is no expression value for a mutated gene then assume it's not expressed at all and assign the lowest value observed in that sample
  ##### NOTE(review): "cn_data" was limited to genes with expression data above, so the genes added here cannot be part of "genes.intersect" below - confirm whether they are meant to be reported
  unexpressed_genes <- setdiff(unique(as.character(mut_data$SYMBOL)), rownames(gene.mut))
  unexpressed_genes <- unexpressed_genes[ !is.na(unexpressed_genes) ]
  
  if ( length(unexpressed_genes) > 0 ) {
    expr_data.perc <- c(expr_data.perc, setNames(rep(min(expr_data.perc), length(unexpressed_genes)), unexpressed_genes))
    expr_data.z <- c(expr_data.z, setNames(rep(min(expr_data.z), length(unexpressed_genes)), unexpressed_genes))
    
    ##### The same labels are used as for the genes with expression data
    gene.mut <- rbind( gene.mut, matrix(vapply(unexpressed_genes, describeGeneMutations, character(1)), ncol = 1, dimnames = list(unexpressed_genes, "Alterations")) )
  }

  ##### Genes with expression, mutation status and copy-number data
  genes.intersect <- intersect(intersect(rownames(gene.mut), cn_data$Gene), names(expr_data.perc))
  
} else {
  ##### No mutation info, deal with genes having expression and copy-number data
  genes.intersect <- intersect(cn_data$Gene, names(expr_data.perc))
}

##### Subset the data to include only overlapping genes and make sure they are all in the same order
cn_data.sub <- cn_data[ cn_data$Gene %in% genes.intersect, ]
rownames(cn_data.sub) <- cn_data.sub$Gene
cn_data.sub <- cn_data.sub[ genes.intersect,  ]
expr_data.perc.sub <- expr_data.perc[ genes.intersect ]
expr_data.z.sub <- expr_data.z[ genes.intersect ]

##### Prepare data frame
if ( has_mut_data ) {
  gene.mut.sub <- gene.mut[ rownames(gene.mut) %in% genes.intersect, ]
  gene.mut.sub <- gene.mut.sub[ genes.intersect ]
  
  cn_data.sub <- data.frame(names(expr_data.z.sub), cn_data.sub$MeanCopyNumber, expr_data.perc.sub, expr_data.z.sub, gene.mut.sub)
  colnames(cn_data.sub) <- c("Gene", "CN", "Perc_diff", "Z_score_diff", "Alterations")
} else {
  cn_data.sub <- data.frame(names(expr_data.z.sub), cn_data.sub$MeanCopyNumber, expr_data.perc.sub, expr_data.z.sub)
  colnames(cn_data.sub) <- c("Gene", "CN", "Perc_diff", "Z_score_diff")
}

ref_dataset.list[[dataset]][["expr_mut_cn_data_all"]] <- cn_data.sub

##### Limit the data to include only cancer genes
cn_data.sub <- cn_data.sub[ cn_data.sub$Gene %in% rownames(ref_genes.list[["genes_cancer"]]), ]

##### Keep genes meeting the user-defined CN values thresholds
ref_dataset.list[[dataset]][["expr_mut_cn_data"]] <- cn_data.sub[ cn_data.sub$CN <= cn_bottom | cn_data.sub$CN >= cn_top, ]

##### Clean the space. The mutation-related objects exist only if mutation data was provided, so only objects that are present are removed
rm(list = intersect(c("cn_data", "cn_data.sub", "expr_data", "gene.mut", "mut_data", "targets", "expr_data.z", "expr_data.perc", "expr_data.z.sub", "expr_data.perc.sub", "expr_genes", "gene.mut.sub", "genes.intersect", "expr_column", "has_mut_data", "describeGeneMutations", "mutated_genes", "unexpressed_genes"), ls()))
```

```{r cn_data_distribution, comment = NA, message=FALSE, warning=FALSE, eval = runPurpleChunk}
suppressMessages(library(plotly))

##### Helper adding a dashed black vertical marker (spanning counts from 0 to 1000) at a given CN value to the histogram
#   cn_plot         plotly object to add the marker to
#   cn_value        CN value (x-axis position) of the marker
#   marker_name     legend entry of the marker
#   marker_opacity  opacity of the marker line
addPercentileMarker <- function(cn_plot, cn_value, marker_name, marker_opacity) {
  add_lines(cn_plot, y = seq(0, 1000, 100), x = rep(cn_value, 11),
            line = list(color = "black", dash = "dash"), opacity = marker_opacity,
            name = marker_name, showlegend = TRUE)
}

##### Draw histogram of CN data with the 5th, 50th and 95th percentiles marked
##### "cn_data.all.percent" holds the 0%, 5%, ..., 100% quantiles, hence positions 2, 11 and 20 are the 5th, 50th and 95th percentile, respectively
cn_dist_plot <- plot_ly(x = cn_data.all$MeanCopyNumber, type = 'histogram', name = "CN data", width = 800, height = 300) %>%
  addPercentileMarker(cn_data.all.percent[2], "5th percentile", 0.4) %>%
  addPercentileMarker(cn_data.all.percent[11], "50th percentile", 0.7) %>%
  addPercentileMarker(cn_data.all.percent[20], "95th percentile", 1) %>%
  layout(xaxis = list( title = "CN values"), yaxis = list( title = "Frequency"), margin = list(l=50, r=50, b=50, t=50, pad=4), autosize = FALSE)

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload=FALSE)

#### Clear plots to free up some memory
if ( !is.null(dev.list()) ) {
  invisible(dev.off())
}

##### Clean the space
rm(cn_data.all, addPercentileMarker)
```

```{r known_fusions_prep, comment = NA, message=FALSE, warning=FALSE, eval = runFusionChunk}
##### Prepare the list of known gene fusions. Outputs: "known_translocations" (merged annotation with the fusion partners in "geneA" and "geneB" columns) and "trans.pairs" ("geneA-geneB" strings, one per "known_translocations" row)

##### Flag known fusions based on info from Cancer Biomarkers database (CGI) (https://www.cancergenomeinterpreter.org/biomarkers)
known_translocations.CGI <- caner_genes_annot.list[["cancer_biomarkers_trans"]]
known_translocations.CGI$cancer_acronym <- gsub(";", ", ", known_translocations.CGI$cancer_acronym)
known_translocations.CGI$source <- gsub(";", ", ", known_translocations.CGI$source)
known_translocations.CGI$translocation <- gsub("__", "_", known_translocations.CGI$translocation)
  
##### Flag known fusions based on info from FusionGDB (https://ccsm.uth.edu/FusionGDB)
known_translocations.FusionGDB <- caner_genes_annot.list[["FusionGDB"]]
  
##### Merge info from both resources (fusions reported in only one of them are kept)
known_translocations <- merge(known_translocations.FusionGDB, known_translocations.CGI, by.x = "FGname", by.y = "translocation", all = TRUE, sort=FALSE)
  
##### Extract gene pairs involved in reported gene fusions. The fusion names are explicitly converted to character, so that the gene names (and not integer codes) are extracted also when "FGname" is a factor
fusion_names <- as.character(known_translocations$FGname)

##### Gene A is the part before the first underscore and gene B is the part after the last underscore
trans.pairs <- data.frame(geneA = sub("_.*", "", fusion_names), geneB = sub(".*_", "", fusion_names), stringsAsFactors = FALSE)
known_translocations <- cbind(known_translocations, trans.pairs)
trans.pairs <- paste(trans.pairs$geneA, trans.pairs$geneB, sep = "-")

##### Clean the space
rm(fusion_names)
```

```{r arriba_filtering, comment = NA, message=FALSE, warning=FALSE, eval = runArribaChunk}
##### Annotate and prioritise the Arriba fusion calls. Outputs: "fusions" (ordered and annotated fusion calls) and "arriba.fusions.order" (fusions in the order reported by Arriba)
##### Requires "known_translocations" and "trans.pairs" prepared in the known fusions chunk

##### Read in the arriba fusion calls and use "A"/"B" instead of "1"/"2" in the column names
arriba.fusions <- ref_genes.list[["arriba"]]
colnames(arriba.fusions) <- gsub("X.gene1", "geneA", colnames(arriba.fusions))
colnames(arriba.fusions) <- gsub("1", "A", colnames(arriba.fusions))
colnames(arriba.fusions) <- gsub("2", "B", colnames(arriba.fusions))

#####  Note the fusions order, which will be later required for imbedding Arriba plots from corresponding pdf booklet pages
arriba.fusions.order <- paste(arriba.fusions$geneA, arriba.fusions$geneB, sep="__")

##### Add (empty) columns for info about reported fusions
fusions <- cbind(arriba.fusions, data.frame(matrix("", ncol = 5, nrow = nrow(arriba.fusions)), stringsAsFactors = FALSE))
colnames(fusions)[(ncol(fusions)-4):ncol(fusions)] <- c("FGID", "reported_fusion", "reported_fusion_geneA", "reported_fusion_geneB", "effector_gene")
  
##### Add annotations about known fusion events. All fusions are annotated at once, which also copes with an empty fusions table
fusion.geneA <- as.character(fusions$geneA)
fusion.geneB <- as.character(fusions$geneB)

##### First check if the exact reported gene pairs were detected by arriba. The pair is looked up as "geneA-geneB" and, if not found, as "geneB-geneA". The first matching known fusion is used
known_fusion.idx <- match(paste(fusion.geneA, fusion.geneB, sep="-"), trans.pairs)
known_fusion.idx.rev <- match(paste(fusion.geneB, fusion.geneA, sep="-"), trans.pairs)
known_fusion.idx[ is.na(known_fusion.idx) ] <- known_fusion.idx.rev[ is.na(known_fusion.idx) ]
fusion.reported <- !is.na(known_fusion.idx)

fusions$reported_fusion <- rep("-", nrow(fusions))
fusions$reported_fusion[ fusion.reported ] <- "Yes"

##### Provide fusion ID from FusionGDB for the reported fusions ("trans.pairs" follows the rows order of "known_translocations")
fusions$FGID[ fusion.reported ] <- as.character(known_translocations$FGID[ known_fusion.idx[ fusion.reported ] ])

##### Now check which of the arriba detected fusion genes are reported, as either of the partners, in any known fusion. Both genes of a reported fusion are flagged as well
known_fusion.genes <- c(as.character(known_translocations$geneA), as.character(known_translocations$geneB))
fusions$reported_fusion_geneA[ fusion.reported | fusion.geneA %in% known_fusion.genes ] <- "Yes"
fusions$reported_fusion_geneB[ fusion.reported | fusion.geneB %in% known_fusion.genes ] <- "Yes"

##### For fusions that are not reported as a pair flag if any of the genes is an effector gene (gene A takes precedence over gene B)
##### %in% is used for both genes, since comparing a gene with the whole effector genes column using == does not give a single TRUE/FALSE value
geneA.effector <- !fusion.reported & fusion.geneA %in% known_translocations$effector_gene
geneB.effector <- !fusion.reported & !geneA.effector & fusion.geneB %in% known_translocations$effector_gene
fusions$effector_gene[ geneA.effector ] <- fusion.geneA[ geneA.effector ]
fusions$effector_gene[ geneB.effector ] <- fusion.geneB[ geneB.effector ]

##### Sum split reads in gene A and B
fusions$split_reads <- fusions$split_readsA + fusions$split_readsB

##### Add column indicating fusions containing known cancer genes, i.e. fusions with gene A or gene B present in the cancer genes list
cancer_genes.names <- rownames(ref_genes.list[["genes_cancer"]])
fusions$fusions_cancer <- rep("-", nrow(fusions))
fusions$fusions_cancer[ fusions$geneA %in% cancer_genes.names | fusions$geneB %in% cancer_genes.names ] <- "Yes"

##### Re-ordering arriba's results on the basis of Arriba's confidence, reported fusions and then read count values (first by split count and then paircount) and then involvement of cancer genes and reported one of the fusion genes
fusions <- fusions[ order(fusions$reported_fusion, fusions$split_reads, fusions$split_readsA, fusions$split_readsB, fusions$discordant_mates, fusions$fusions_cancer, fusions$reported_fusion_geneA, fusions$reported_fusion_geneB, decreasing = TRUE), ]
fusions <- fusions[order(factor(fusions$confidence, levels=c("high", "medium", "low"))), ]

##### Keep only key columns
fusions <- fusions[ colnames(fusions) %in% c("geneA", "geneB", "breakpointA", "breakpointB", "siteA", "siteB", "type", "split_reads", "split_readsA", "split_readsB", "discordant_mates", "confidence", "FGID", "reported_fusion", "reported_fusion_geneA", "reported_fusion_geneB", "effector_gene", "fusions_cancer")]

##### Add column to flag fusions supported by WGS data (from MANTA), if available
fusions$geneA_dna_support <- "-"
fusions$geneB_dna_support <- "-"

##### Flag the fusions as detected by Arriba if results from other fusion callers are to be added
if ( runPizzlyChunk || runDragenFusionChunk ) {
  fusions$Arriba <- c(rep("Yes", nrow(fusions)))
}

##### Clean the space. Only objects that are present are removed
rm(list = intersect(c("arriba.fusions", "arriba.fusion.transcripts", "arriba.cancer_genes", "arriba.other_genes", "fusion.geneA", "fusion.geneB", "known_fusion.idx", "known_fusion.idx.rev", "fusion.reported", "known_fusion.genes", "geneA.effector", "geneB.effector", "cancer_genes.names"), ls()))
```

```{r dragen_filtering, comment = NA, message=FALSE, warning=FALSE, eval = runDragenFusionChunk}
##### Read in the Dragen fusion calls
dragen.fusions <- ref_genes.list[["dragenFusion"]]
colnames(dragen.fusions) <- gsub("gene1", "geneA", colnames(dragen.fusions))
colnames(dragen.fusions) <- gsub("1", "A", colnames(dragen.fusions))
colnames(dragen.fusions) <- gsub("2", "B", colnames(dragen.fusions))

##### Extract only those fusion genes that are in cancer genes list
dragen.cancer_genes <- data.frame()

for (row in 1:nrow(dragen.fusions)){
  if(dragen.fusions[row,"geneA"] %in% rownames(ref_genes.list[["genes_cancer"]]) | dragen.fusions[row,"geneB"] %in% rownames(ref_genes.list[["genes_cancer"]])) {
    
    ##### Creating a new dataframe for extracting dragen rows with cancer gene hits
    dragen.cancer_genes <- rbind(dragen.cancer_genes, data.frame(dragen.fusions[row,]))
  }
}

##### Add columns for info about reported fusions
dragen.fusions <- cbind(dragen.fusions, data.frame(matrix("", ncol = 5, nrow = nrow(dragen.fusions)), stringsAsFactors = FALSE))
colnames(dragen.fusions)[(ncol(dragen.fusions)-4):ncol(dragen.fusions)] <- c("FGID", "reported_fusion", "reported_fusion_geneA", "reported_fusion_geneB", "effector_gene")

##### Add annotations about known fusion events
##### Loop through all genes involved in deteced gene fusions (dragen results) and check which are already reported
for ( i in 1:nrow(dragen.fusions) ) {
  geneA <- as.character(dragen.fusions$geneA[i])
  geneB <- as.character(dragen.fusions$geneB[i])
          
  ##### First check if the exact reported gene pairs were detected by dragen
  if ( paste(geneA, geneB, sep="-") %in% trans.pairs ) {
      
    ##### provide fusion URL to FusionGDB
    dragen.fusions$reported_fusion[i] <- "Yes"
      
    ##### provide fusion ID from FusionGDB
    dragen.fusions$FGID[i] <- known_translocations$FGID[ trans.pairs %in% paste(geneA, geneB, sep="-")  ]
      
    dragen.fusions$reported_fusion_geneA[i] <- "Yes"
    dragen.fusions$reported_fusion_geneB[i] <- "Yes"
      
  } else if ( paste(geneB, geneA, sep="-") %in% trans.pairs ) {
      
    ##### provide fusion URL to FusionGDB
    dragen.fusions$reported_fusion[i] <- "Yes"
      
    ##### provide fusion ID from FusionGDB
    dragen.fusions$FGID[i] <- known_translocations$FGID[ trans.pairs %in% paste(geneB, geneA, sep="-")  ]
      
    dragen.fusions$reported_fusion_geneA[i] <- "Yes"
    dragen.fusions$reported_fusion_geneB[i] <- "Yes"
      
  ##### Now check if any of the dragen detected fusion genes are reported
  } else {
    dragen.fusions$reported_fusion[i] <- "-"
      
    ##### Check the Cancer Genome Interpreter (CGI) database first
    ##### Check dragen genes A and genes B in reported fusions
    if ( geneA %in% known_translocations$geneA ) {
       dragen.fusions$reported_fusion_geneA[i] <- "Yes"
        
    ##### Check dragen genes A and genes B in reported fusions
    } else if ( geneA %in% known_translocations$geneB ) {
      dragen.fusions$reported_fusion_geneA[i] <- "Yes"
    }
      
    ##### Check dragen genes B and genes A in reported fusions
    if ( geneB %in% known_translocations$geneA ) {
      dragen.fusions$reported_fusion_geneB[i] <- "Yes"
        
    ##### Check dragen genes B and genes A in reported fusions
    } else if ( geneB %in% known_translocations$geneB ) {
      dragen.fusions$reported_fusion_geneB[i] <- "Yes"
    }
      
    ##### Flag if any of the genes are effector gene
    if ( geneA %in% known_translocations$effector_gene  ) {
      dragen.fusions$effector_gene[i] <- geneA
    } else if ( geneB == known_translocations$effector_gene  ) {
      dragen.fusions$effector_gene[i] <- geneB
    }
  }
}

##### Add column indicating fusions containing known cancer genes
dragen.fusions$fusions_cancer <- c(rep("-", nrow(dragen.fusions)))

if ( nrow(dragen.cancer_genes) > 0 ) {
  dragen.fusions$fusions_cancer[ dragen.fusions$geneA %in% dragen.cancer_genes$geneA ] <- "Yes"
  dragen.fusions$fusions_cancer[ dragen.fusions$geneB %in% dragen.cancer_genes$geneB ] <- "Yes"
}

##### Re-order Dragen's results by reported fusions, then by score (Dragen doesn't include split count and pair count info), then by involvement of cancer genes and finally by whether either of the fusion genes is reported
dragen.fusions <- dragen.fusions[ order(dragen.fusions$reported_fusion, dragen.fusions$Score, dragen.fusions$fusions_cancer, dragen.fusions$reported_fusion_geneA, dragen.fusions$reported_fusion_geneB, decreasing = TRUE), ]
#dragen.fusions <- dragen.fusions[order(factor(dragen.fusions$confidence, levels=c("high", "medium", "low"))), ]

##### Keep only key columns
dragen.fusions <- dragen.fusions[ colnames(dragen.fusions) %in% c("geneA", "geneB", "Score", "LeftBreakpoint", "RightBreakpoint", "GeneALocation", "GeneBLocation", "NumSplitReads", "NumSoftClippedReads", "FGID", "reported_fusion", "reported_fusion_geneA", "reported_fusion_geneB", "effector_gene", "fusions_cancer")]

##### Add column to flag fusions supported by WGS data (from MANTA), if available
dragen.fusions$geneA_dna_support <- "-"
dragen.fusions$geneB_dna_support <- "-"

##### Combine Dragen results with Arriba results (held in "fusions"), if available
##### - Arriba available: fusions called by both tools get "Yes" in the "Dragen" column of "fusions" and the corresponding "dragen.fusions" row is blanked with "-". Fusions called only by Dragen are appended to "fusions" in Arriba-like format
##### - Arriba not available: "fusions" is created from Dragen results, with columns renamed to Arriba-like names
##### The set of columns present in "dragen.fusions" determines which Dragen output format is handled
if ( runArribaChunk ) {
  
  #####  Dragen's fusion format version 3.9.3
  if ( all(c("GeneALocation", "GeneBLocation", "NumSplitReads","NumSoftClippedReads", "Score") %in% colnames(dragen.fusions)) ) {

    ##### Add column with Dragen fusions
    dragen.fusions$fusion <- paste(dragen.fusions$geneA, dragen.fusions$geneB, sep="__")
    fusions$Dragen <- rep("-", nrow(fusions))
    fusions$split_reads <- fusions$split_readsA + fusions$split_readsB
    fusions$soft_clipped_reads <- rep("-", nrow(fusions))
    fusions$score <- rep("-", nrow(fusions))
    
    ##### Re-order columns
    fusions <- fusions %>% dplyr::relocate(split_reads, .before = split_readsA)
    fusions <- fusions %>% dplyr::relocate(soft_clipped_reads, .before = confidence)
    fusions <- fusions %>% dplyr::relocate(score, .before = FGID)
    
    ##### Loop through Dragen results, mark fusions detected by both tools. For those detected only by Dragen adapt results format to Arriba results
    ##### NOTE: seq_len() is used so that the loop is skipped for an empty Dragen table ("1:nrow()" would iterate over 1 and 0 and append an all-NA row)
    for ( i in seq_len(nrow(dragen.fusions)) ) {
      
      if ( !is.na(match(dragen.fusions$fusion[i], paste(fusions$geneA, fusions$geneB, sep="__"))) ) {
        fusions$Dragen[ match(dragen.fusions$fusion[i], paste(fusions$geneA, fusions$geneB, sep="__")) ] <- "Yes"
        dragen.fusions[ i, ] <-  rep("-", ncol(dragen.fusions))
      } else {
        fusions <- rbind(fusions, data.frame(geneA=dragen.fusions$geneA[i],geneB=dragen.fusions$geneB[i], breakpointA=dragen.fusions$LeftBreakpoint[i], breakpointB=dragen.fusions$RightBreakpoint[i], siteA=dragen.fusions$GeneALocation[i], siteB=dragen.fusions$GeneBLocation[i], type="-", split_reads=dragen.fusions$NumSplitReads[i], split_readsA="-", split_readsB="-", discordant_mates="-", soft_clipped_reads=dragen.fusions$NumSoftClippedReads[i], confidence="-", score=dragen.fusions$Score[i], FGID=dragen.fusions$FGID[i], reported_fusion=dragen.fusions$reported_fusion[i], reported_fusion_geneA=dragen.fusions$reported_fusion_geneA[i], reported_fusion_geneB=dragen.fusions$reported_fusion_geneB[i], effector_gene=dragen.fusions$effector_gene[i], fusions_cancer=dragen.fusions$fusions_cancer[i], geneA_dna_support="-", geneB_dna_support="-", Arriba="-", Dragen="Yes" ))
      }
    }
  
  #####  Dragen's fusion format prior to version 3.9.3 (no gene location, split reads or soft-clipped reads info)
  } else {
    
    ##### Add column with Dragen fusions
    dragen.fusions$fusion <- paste(dragen.fusions$geneA, dragen.fusions$geneB, sep="__")
    fusions$Dragen <- rep("-", nrow(fusions))
    fusions$split_reads <- fusions$split_readsA + fusions$split_readsB
    fusions$score <- rep("-", nrow(fusions))
    
    ##### Re-order columns
    fusions <- fusions %>% dplyr::relocate(split_reads, .before = split_readsA)
    fusions <- fusions %>% dplyr::relocate(score, .before = FGID)
      
    ##### Loop through Dragen results, mark fusions detected by both tools. For those detected only by Dragen adapt results format to Arriba results
    ##### NOTE: seq_len() is used so that the loop is skipped for an empty Dragen table
    for ( i in seq_len(nrow(dragen.fusions)) ) {
        
      if ( !is.na(match(dragen.fusions$fusion[i], paste(fusions$geneA, fusions$geneB, sep="__"))) ) {
        fusions$Dragen[ match(dragen.fusions$fusion[i], paste(fusions$geneA, fusions$geneB, sep="__")) ] <- "Yes"
        dragen.fusions[ i, ] <-  rep("-", ncol(dragen.fusions))
      } else {
        fusions <- rbind(fusions, data.frame(geneA=dragen.fusions$geneA[i],geneB=dragen.fusions$geneB[i], breakpointA=dragen.fusions$LeftBreakpoint[i], breakpointB=dragen.fusions$RightBreakpoint[i], siteA="-", siteB="-", type="-", split_reads="-", split_readsA="-", split_readsB="-", discordant_mates="-", confidence="-", score=dragen.fusions$Score[i], FGID=dragen.fusions$FGID[i], reported_fusion=dragen.fusions$reported_fusion[i], reported_fusion_geneA=dragen.fusions$reported_fusion_geneA[i], reported_fusion_geneB=dragen.fusions$reported_fusion_geneB[i], effector_gene=dragen.fusions$effector_gene[i], fusions_cancer=dragen.fusions$fusions_cancer[i], geneA_dna_support="-", geneB_dna_support="-", Arriba="-", Dragen="Yes" ))
      }
    }
  }
  
##### Otherwise use Dragen results as the fusions table and rename its columns to match the names used for Arriba results
} else {
  fusions <- dragen.fusions
  
  #####  Dragen's fusion format version 3.9.3
  if ( all(c("GeneALocation", "GeneBLocation", "NumSplitReads","NumSoftClippedReads", "Score") %in% colnames(dragen.fusions)) ) {
    
    ##### Rename columns
    names(fusions) <- gsub("LeftBreakpoint", "breakpointA", names(fusions))
    names(fusions) <- gsub("RightBreakpoint", "breakpointB", names(fusions))
    names(fusions) <- gsub("GeneALocation", "siteA", names(fusions))
    names(fusions) <- gsub("GeneBLocation", "siteB", names(fusions))
    names(fusions) <- gsub("NumSplitReads", "split_reads", names(fusions))
    names(fusions) <- gsub("NumSoftClippedReads", "soft_clipped_reads", names(fusions))
    names(fusions) <- gsub("Score", "score", names(fusions))
  
  #####  Dragen's fusion format prior to version 3.9.3
  } else {
    
    ##### Rename columns
    names(fusions) <- gsub("LeftBreakpoint", "breakpointA", names(fusions))
    names(fusions) <- gsub("RightBreakpoint", "breakpointB", names(fusions))
    names(fusions) <- gsub("Score", "score", names(fusions))
  }
}

##### Clean the space and return output
rm(dragen.fusion.transcripts, dragen.cancer_genes, dragen.other_genes)
```

```{r pizzly_filtering, comment = NA, message=FALSE, warning=FALSE, eval = runPizzlyChunk}
##### Read in the pizzly fusion calls
pizzly.fusion.candidates <- ref_genes.list[["pizzly"]]

##### Split the calls into (1) fusions with gene A or gene B present in the cancer genes list (row names of ref_genes.list[["genes_cancer"]]) and (2) all remaining fusions
##### NOTE: a vectorised mask is used instead of a row-by-row loop with rbind(). This handles an empty pizzly table (the "1:nrow()" loop fails on it) and does not depend on row names being preserved when the rows are copied
pizzly.cancer_hits <- pizzly.fusion.candidates[["geneA.name"]] %in% rownames(ref_genes.list[["genes_cancer"]]) | pizzly.fusion.candidates[["geneB.name"]] %in% rownames(ref_genes.list[["genes_cancer"]])

##### Fusions involving cancer genes
pizzly.cancer_genes <- pizzly.fusion.candidates[ pizzly.cancer_hits, , drop = FALSE ]

##### Fusions not involving any gene from the cancer genes list
pizzly.other_genes <- pizzly.fusion.candidates[ !pizzly.cancer_hits, , drop = FALSE ]
rm(pizzly.cancer_hits)
  
##### Combine the two data frames, with fusions involving cancer genes on top
pizzly.fusions <- rbind(pizzly.cancer_genes, pizzly.other_genes)
  
##### Flag known fusions based on info from Cancer Biomarkers database (CGI) and FusionGDB (https://ccsm.uth.edu/FusionGDB)
##### Add columns (initialised with empty strings) for info about reported fusions
pizzly.fusions <- cbind(pizzly.fusions, data.frame(matrix("", ncol = 5, nrow = nrow(pizzly.fusions)), stringsAsFactors = FALSE))
colnames(pizzly.fusions)[(ncol(pizzly.fusions)-4):ncol(pizzly.fusions)] <- c("FGID", "reported_fusion", "reported_fusion_geneA", "reported_fusion_geneB", "effector_gene")
  
##### Add annotations about known fusion events
##### Loop through all genes involved in detected gene fusions (pizzly results) and check which are already reported
##### NOTE: "trans.pairs" is defined outside this chunk. It is assumed to hold "geneA-geneB" pairs in the same order as the rows of "known_translocations" - TODO confirm
##### NOTE: seq_len() is used so that the loop is skipped for an empty pizzly table
for ( i in seq_len(nrow(pizzly.fusions)) ) {
  geneA <- as.character(pizzly.fusions$geneA.name[i])
  geneB <- as.character(pizzly.fusions$geneB.name[i])
          
  ##### First check if the exact reported gene pair was detected by pizzly, in the reported (A-B) or the reversed (B-A) orientation
  ##### match() returns the position of the first matching pair, so that exactly one FGID is assigned even if the pair is listed more than once
  pair_idx <- match(paste(geneA, geneB, sep="-"), trans.pairs)
  
  if ( is.na(pair_idx) ) {
    pair_idx <- match(paste(geneB, geneA, sep="-"), trans.pairs)
  }
  
  if ( !is.na(pair_idx) ) {
      
    ##### Flag the fusion, and both genes, as reported
    pizzly.fusions$reported_fusion[i] <- "Yes"
    pizzly.fusions$reported_fusion_geneA[i] <- "Yes"
    pizzly.fusions$reported_fusion_geneB[i] <- "Yes"
      
    ##### Provide fusion ID from FusionGDB
    pizzly.fusions$FGID[i] <- known_translocations$FGID[ pair_idx ]
      
  ##### Now check if any of the pizzly detected fusion genes are reported
  } else {
    pizzly.fusions$reported_fusion[i] <- "-"
      
    ##### Check if pizzly gene A is reported as either gene A or gene B in known fusions
    if ( geneA %in% known_translocations$geneA || geneA %in% known_translocations$geneB ) {
      pizzly.fusions$reported_fusion_geneA[i] <- "Yes"
    }
      
    ##### Check if pizzly gene B is reported as either gene A or gene B in known fusions
    if ( geneB %in% known_translocations$geneA || geneB %in% known_translocations$geneB ) {
      pizzly.fusions$reported_fusion_geneB[i] <- "Yes"
    }
      
    ##### Flag if any of the genes is listed as an effector gene. Gene A takes precedence when both genes are listed
    ##### NOTE: "%in%" is used for both genes. Comparing gene B with "==" against the whole "effector_gene" column gives a vector condition, which "if" rejects (error in R >= 4.2)
    if ( geneA %in% known_translocations$effector_gene ) {
      pizzly.fusions$effector_gene[i] <- geneA
    } else if ( geneB %in% known_translocations$effector_gene ) {
      pizzly.fusions$effector_gene[i] <- geneB
    }
  }
}
  
##### Flag ("Yes") the fusions whose gene A / gene B is listed in the corresponding column of "pizzly.cancer_genes". All other fusions are marked with "-"
pizzly.fusions$fusions_cancer <- rep("-", nrow(pizzly.fusions))

if ( nrow(pizzly.cancer_genes) > 0 ) {
  pizzly.fusions$fusions_cancer[ pizzly.fusions$geneA.name %in% pizzly.cancer_genes$geneA.name | pizzly.fusions$geneB.name %in% pizzly.cancer_genes$geneB.name ] <- "Yes"
}

##### Sort the fusions (in decreasing order) by reported fusion status, split count, pair count, involvement of cancer genes and reported status of the individual genes
pizzly.sort_idx <- order(pizzly.fusions$reported_fusion, pizzly.fusions$splitcount, pizzly.fusions$paircount, pizzly.fusions$fusions_cancer, pizzly.fusions$reported_fusion_geneA, pizzly.fusions$reported_fusion_geneB, decreasing = TRUE)
pizzly.fusions <- pizzly.fusions[ pizzly.sort_idx, ]
rm(pizzly.sort_idx)

##### Rename columns to match Arriba results. Each element holds the pattern and its replacement, applied in the given order
for ( renamed in list(c("geneA.name", "geneA"), c("geneB.name", "geneB"), c("paircount", "discordant_mates"), c("splitcount", "split_reads")) ) {
  colnames(pizzly.fusions) <- gsub(renamed[1], renamed[2], colnames(pizzly.fusions))
}
rm(renamed)

##### Drop the columns that are not reported
pizzly.fusions <- pizzly.fusions[ colnames(pizzly.fusions) %!in% c("geneA.id", "geneB.id", "transcripts.list")]

##### Combine pizzly results with Arriba results (held in "fusions"), if available
if ( runArribaChunk ) {
  
  ##### Add column with pizzly fusions
  pizzly.fusions$fusion <- paste(pizzly.fusions$geneA, pizzly.fusions$geneB, sep="__")
  fusions$Pizzly <- rep("-", nrow(fusions))
  
  ##### Loop through Pizzly results, mark fusions detected by both tools. For those detected only by pizzly adapt results format to Arriba results
  ##### NOTE: seq_len() is used so that the loop is skipped for an empty pizzly table ("1:nrow()" would iterate over 1 and 0 and append an all-NA row)
  for ( i in seq_len(nrow(pizzly.fusions)) ) {
    
    if ( !is.na(match(pizzly.fusions$fusion[i], paste(fusions$geneA, fusions$geneB, sep="__"))) ) {
      fusions$Pizzly[ match(pizzly.fusions$fusion[i], paste(fusions$geneA, fusions$geneB, sep="__")) ] <- "Yes"
      pizzly.fusions[ i, ] <-  rep("-", ncol(pizzly.fusions))
    } else {
      fusions <- rbind(fusions, data.frame(geneA=pizzly.fusions$geneA[i],geneB=pizzly.fusions$geneB[i], breakpointA="-", breakpointB="-", siteA="-", siteB="-", type="-", split_reads=pizzly.fusions$split_reads[i], split_readsA="-", split_readsB="-", discordant_mates=pizzly.fusions$discordant_mates[i], confidence="-", FGID=pizzly.fusions$FGID[i], reported_fusion=pizzly.fusions$reported_fusion[i], reported_fusion_geneA=pizzly.fusions$reported_fusion_geneA[i], reported_fusion_geneB=pizzly.fusions$reported_fusion_geneB[i], effector_gene=pizzly.fusions$effector_gene[i], fusions_cancer=pizzly.fusions$fusions_cancer[i], geneA_dna_support="-", geneB_dna_support="-", Arriba="-", Pizzly="Yes" ))
    }
  }
##### Otherwise use pizzly results (already renamed to Arriba-like column names) as the fusions table
} else {
  fusions <- pizzly.fusions
}

##### Add columns to flag fusions supported by WGS data (from MANTA), if available. Initialised with "-" (no support) for all fusions
fusions$geneA_dna_support <- "-"
fusions$geneB_dna_support <- "-"

##### Clean the space and return output
rm(pizzly.fusions, pizzly.fusion.transcripts, pizzly.fusion.candidates, known_translocations.CGI, known_translocations.FusionGDB, pizzly.cancer_genes, pizzly.other_genes, trans.pairs)
```

```{r fusions_annot, comment = NA, message=TRUE, warning=FALSE, eval = runFusionChunk}
##### Annotate fusion genes with genomic info. The result ("fusion_annot") holds, for each fusion, the annotation of gene A (columns "SYMBOL", "ENSEMBL", ...) followed by the annotation of gene B (same columns with ".1" suffix) and the selected fusion info
##### Get data to annotate fusion genes
fusion_genes_annot <- ref_dataset.list[[dataset]][["gene_annot_all"]][ , c("ENSEMBL", "SYMBOL", "SEQNAME", "GENESEQSTART", "GENESEQEND") ]

##### Keep one annotation record per gene symbol. Otherwise a symbol with several records (e.g. several ENSEMBL IDs) yields several rows per fusion in merge() below, and the per-gene tables no longer line up row-by-row with the fusions table in cbind()
fusion_genes_annot <- fusion_genes_annot[ !duplicated(fusion_genes_annot$SYMBOL), ]

##### Record the original order of fusions, since merge() does not preserve it
fusions.annot <- fusions
fusions.annot$order <- seq_len(nrow(fusions.annot))

##### Get genomic info for fusions genes. The tables are merged by the 2nd column in each of them, i.e. "SYMBOL" and "geneA" / "geneB". Genes with no annotation are kept (with NAs)
fusion_annot1 <- merge(fusion_genes_annot, fusions.annot[ , c("order","geneA")], by = 2, sort=FALSE, all.y = TRUE)
fusion_annot1 <- fusion_annot1[ order(fusion_annot1$order), ]
fusion_annot2 <- merge(fusion_genes_annot, fusions.annot[ , c("order","geneB")], by = 2, sort=FALSE, all.y = TRUE)
fusion_annot2 <- fusion_annot2[ order(fusion_annot2$order), ]

##### Select the fusion info columns available for the used fusion caller(s)
##### Dragen + Arriba
if ( runDragenFusionChunk && runArribaChunk ) {
  fusion_cols <- c("score", "breakpointA", "breakpointB", "discordant_mates", "split_reads",  "split_readsA", "split_readsB", "reported_fusion", "fusions_cancer", "reported_fusion_geneA", "reported_fusion_geneB")
  
##### Arriba / Arriba + Pizzly
} else if ( runArribaChunk ) {
  fusion_cols <- c("breakpointA", "breakpointB", "discordant_mates", "split_reads", "split_readsA", "split_readsB", "reported_fusion", "fusions_cancer", "reported_fusion_geneA", "reported_fusion_geneB")
  
##### Dragen only
} else if ( runDragenFusionChunk ) {
  
  #####  Dragen's fusion format version 3.9.3
  if ( all(c("GeneALocation", "GeneBLocation", "NumSplitReads","NumSoftClippedReads", "Score") %in% colnames(dragen.fusions)) ) {
    fusion_cols <- c("score", "breakpointA", "breakpointB", "split_reads", "reported_fusion", "fusions_cancer", "reported_fusion_geneA", "reported_fusion_geneB")
    
  #####  Dragen's fusion format prior to version 3.9.3
  } else {
    fusion_cols <- c("score", "breakpointA", "breakpointB", "reported_fusion", "fusions_cancer", "reported_fusion_geneA", "reported_fusion_geneB")
  }
  
##### Pizzly only
} else {
  fusion_cols <- c("split_reads", "discordant_mates", "reported_fusion", "fusions_cancer", "reported_fusion_geneA", "reported_fusion_geneB")
}

fusion_annot <- cbind(fusion_annot1, fusion_annot2, fusions.annot[, fusion_cols])

##### Add columns to flag fusions supported by WGS data (from MANTA), if available. Initialised with "-" (no support) for all fusions
fusion_annot$geneA_dna_support <- "-"
fusion_annot$geneB_dna_support <- "-"

##### Make the column names unique (the gene B annotation columns get ".1" suffix)
colnames(fusion_annot) <- make.names(colnames(fusion_annot), unique=TRUE)

##### Remove entries with missing annotation
fusion_annot <- fusion_annot[complete.cases(fusion_annot), ]

##### Clean the space
rm(fusion_annot1, fusion_annot2, fusions.annot, fusion_genes_annot, fusion_cols)
```

```{r fusions_and_manta_data_prep, comment = NA, message=FALSE, warning=FALSE, eval = runSVsChunk}
##### Compare gene fusion events called from RNA data (held in "fusions") with those called by MANTA
##### Add row for gene fusion events so that there is one row per gene. The original "geneA&geneB" string is kept in the "Fusion genes" column, and is emptied for events involving a single gene
manta_sv <- ref_genes.list[["manta"]]
manta_sv$"Fusion genes" <- manta_sv$Gene

i <- 1
while ( i <= nrow(manta_sv) ) {
  
  ##### Genes involved in the event (separated by "&")
  sv_genes <- strsplit(manta_sv$Gene[i], split='&', fixed=TRUE)[[1]]
  
  if ( length(sv_genes) > 1 ) {
     
    ##### Insert new row (a copy of the current one) for events involving two genes, and keep one gene per row
    manta_sv <- tibble::add_row(manta_sv, .after = i)
    manta_sv[i+1, ] <- manta_sv[i, ]
    manta_sv$Gene[i] <- sv_genes[1]
    manta_sv$Gene[i+1] <- sv_genes[2]
    
    ##### Skip the row that was just inserted
    i <- i + 2
    
  } else {
    manta_sv$"Fusion genes"[i] <- ""
    i <- i + 1
  }
}

##### Compare fusion genes called from RNA data and by MANTA
##### NOTE: "fusions", "fusion_annot" and "manta_fusions" exist only if the fusion chunks were run, so all code using them is kept within this condition (this chunk is evaluated based on "runSVsChunk" alone)
if ( runFusionChunk ) {
  
  ##### First limit MANTA output to genes involved in fusions that were also detected in RNA data
  manta_fusions <- unique(manta_sv[ grep("&", manta_sv$"Fusion genes"),  ]$Gene)
  manta_fusions <- manta_fusions[ manta_fusions %in% unique(c(as.vector(fusions$geneA), as.vector(fusions$geneB))) ]
    
  ##### Flag fusions that were also reported in MANTA
  ##### NOTE: "%in%" is used to flag ALL fusions involving the individual genes (match() would flag only the first fusion for each gene)
  if ( length(manta_fusions) > 0 ) {
    fusions$geneA_dna_support[ fusions$geneA %in% manta_fusions ] <- "Yes"
    fusions$geneB_dna_support[ fusions$geneB %in% manta_fusions ] <- "Yes"
      
    fusion_annot$geneA_dna_support[ fusion_annot$SYMBOL %in% manta_fusions ] <- "Yes"
    fusion_annot$geneB_dna_support[ fusion_annot$SYMBOL.1 %in% manta_fusions ] <- "Yes"
  
    ##### Re-order fusion dataframe with MANTA supporting fusions on top
    if ( runArribaChunk ) {
      idx <- order(fusions$geneA_dna_support, fusions$geneB_dna_support, fusions$Arriba, fusions$reported_fusion, decreasing = TRUE)
    } else {
      idx <- order(fusions$geneA_dna_support, fusions$geneB_dna_support, fusions$reported_fusion, decreasing = TRUE)
    }
    
    ##### NOTE(review): "idx" is derived from "fusions" but is also applied to "fusion_annot", which may hold fewer rows (entries with missing annotation are removed from it). The two tables are then not guaranteed to end up in the same order - confirm that the order of "fusion_annot" is not relied upon
    fusions <- fusions[ idx, ]
    fusion_annot <- fusion_annot[ idx, ]
  }
  
  ##### Remove entries with missing annotation (including all-NA rows introduced by indices exceeding the number of rows)
  fusion_annot <- fusion_annot[complete.cases(fusion_annot), ]
  
  ##### Clean the space and return output
  rm(manta_fusions)
}
```

```{r fusions_filtering, comment = NA, message=FALSE, warning=FALSE, eval = runFusionChunk}
##### Filter out fusions that have < 2 split reads and < 2 discordant mates, are not supported by genomic data, are not reported and don't involve cancer genes. The same is applied to the fusions table ("fusions") and the annotated fusions table ("fusion_annot")

##### Dragen + Arriba
##### NOTE(review): "score > 0" is used for "fusions" but not for "fusion_annot" - confirm this is intended
if ( runDragenFusionChunk && runArribaChunk ) {
  fusions <- fusions %>% dplyr::filter( split_reads > 1 | discordant_mates > 1 | score > 0 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")
  fusion_annot <- fusion_annot %>% dplyr::filter( split_reads > 1 | discordant_mates > 1 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")

##### Arriba / Arriba + Pizzly
} else if ( runArribaChunk ) {
  fusions <- fusions %>% dplyr::filter( split_reads > 1 | discordant_mates > 1 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")
  fusion_annot <- fusion_annot %>% dplyr::filter( split_reads > 1 | discordant_mates > 1 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")
  
##### Dragen only
} else if ( runDragenFusionChunk ) {
  
  ##### For Dragen , this filtering is not changing the results. We'll review the "Score" value again once we start to regularly produce the RNAsum report for Dragen results
  ##### NOTE(review): different "score" thresholds are used for "fusions" (> 0) and "fusion_annot" (> 1) - confirm this is intended
  #####  Dragen's fusion format version 3.9.3
  ##### NOTE: the format is recognised based on the original Dragen column names in "dragen.fusions" (as in the other chunks). These columns are renamed in "fusions" when only Dragen results are used, so checking "fusions" would never detect this format
  if ( all(c("GeneALocation", "GeneBLocation", "NumSplitReads","NumSoftClippedReads", "Score") %in% colnames(dragen.fusions)) ) {
    fusions <- fusions %>% dplyr::filter( split_reads > 1 | score > 0 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")
    fusion_annot <- fusion_annot %>% dplyr::filter( score > 1 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")
  
  #####  Dragen's fusion format prior to version 3.9.3
  } else {
    fusions <- fusions %>% dplyr::filter( score > 0 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")
    fusion_annot <- fusion_annot %>% dplyr::filter( score > 1 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")
  }
  
##### Pizzly only
} else {
  fusions <- fusions %>% dplyr::filter(split_reads > 1 | discordant_mates > 1 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")
  fusion_annot <- fusion_annot %>% dplyr::filter(split_reads > 1 | discordant_mates > 1 | geneA_dna_support != "-" | geneB_dna_support != "-" | reported_fusion != "-" | fusions_cancer != "-")
}
```

```{r fusions_circos, comment = NA, message=FALSE, warning=FALSE, eval = runFusionChunk}
##### Indicate which fusions have genomic coordinates and can be presented on circos plot
##### Take into account reported fusions and, if structural variants data is available, also fusions with gene A or gene B supported by DNA
circos_rows <- fusion_annot$reported_fusion == "Yes"

if ( runSVsChunk ) {
  circos_rows <- circos_rows | fusion_annot$geneA_dna_support == "Yes" | fusion_annot$geneB_dna_support == "Yes"
}

fusion_annot_top <- fusion_annot[ circos_rows , ]
rm(circos_rows)

##### Flag the fusions (gene pairs) present among the selected annotated fusions
fusions$circos <- "-"
fusions$circos[ paste(fusions$geneA, fusions$geneB, sep="-") %in% paste(fusion_annot_top$SYMBOL, fusion_annot_top$SYMBOL.1, sep="-") ] <- "Yes"
```

```{r immunogram_table_prep, comment = NA, message=FALSE, warning=FALSE, eval = params$immunogram}
##### Extract data for Immunogram genes
##### NOTE: "drop = FALSE" keeps the two-dimensional structure even if only one gene is matched
data <- ref_dataset.list[[dataset]][["data_to_report"]]
data <- data[ rownames(data) %in% ref_genes.list[["genes_immune"]]$immunogram$SYMBOL, , drop = FALSE ]

##### Create list with calculation results for each individual Cancer-Immunity Cycle (CIC) step
CIC.list <- vector("list", length(unique(ref_genes.list[["genes_immune"]]$immunogram$CIC)))
names(CIC.list) <- unique(ref_genes.list[["genes_immune"]]$immunogram$CIC)
  
##### Calculate average expression (per sample, i.e. per column) for each Cancer-Immunity Cycle (CIC) step
for ( cic_step in unique(ref_genes.list[["genes_immune"]]$immunogram$CIC) ) {
  
  genes <- ref_genes.list[["genes_immune"]]$immunogram$SYMBOL[ ref_genes.list[["genes_immune"]]$immunogram$CIC %in% cic_step ]
  
  ##### "drop = FALSE" is required for colMeans() to work for CIC steps with a single gene present in the data
  data.sub <- data[ rownames(data) %in% genes, , drop = FALSE ]
  CIC.list[[cic_step]] <- colMeans(data.sub)
}

##### Convert the list into a table with samples in rows and CIC steps in columns
ref_genes.list[["genes_immune"]]$immunogram.df <- t(data.frame(matrix(unlist(CIC.list), nrow=length(CIC.list), byrow=TRUE),stringsAsFactors=FALSE))
colnames(ref_genes.list[["genes_immune"]]$immunogram.df) <- names(CIC.list)

##### Sample names are taken from the full data table, rather than from the subset left over from the last loop iteration
rownames(ref_genes.list[["genes_immune"]]$immunogram.df) <- colnames(data)
```

```{r ref_cohorts_summary, comment = NA, message=FALSE, warning=FALSE}
##### Summarise the reference cohorts samples
##### Count the samples in each group defined in the "Target" column of the samples annotation
target <- ref_dataset.list[[dataset]][["sample_annot"]]
target_counts <- table(target$Target)

##### Number of samples in the external and the internal reference cohort
ref_ext_cancer <- target_counts[ names(target_counts) == ext_cancer_group ]
ref_int_cancer <- target_counts[ names(target_counts) == int_cancer_group ]

##### Samples from the additional cancer group, if defined, are counted as part of the external reference cohort
if ( !is.null(add_cancer_group) ) {
  ref_ext_cancer <- target_counts[ names(target_counts) == c(ext_cancer_group) ] + target_counts[ names(target_counts) == c(add_cancer_group) ]
}

rm(target_counts)
```

```{r goi_summary_update, comment = NA, message=FALSE, warning=FALSE}
##### Update the lists of altered genes (held in ref_genes.list[["summary"]]) for...
##### ...gene fusion section: include only genes from fusions that are reported or have gene A or gene B supported by DNA (see Structural variants section)
if ( runFusionChunk ) {
  fusions_to_report <- fusions[ fusions$reported_fusion == "Yes" | fusions$geneA_dna_support == "Yes" | fusions$geneB_dna_support == "Yes" , ]
  ref_genes.list[["summary"]]$Fusion <- unique(c(as.character(fusions_to_report$geneA), as.character(fusions_to_report$geneB)))
  rm(fusions_to_report)
} else {
  ref_genes.list[["summary"]]$Fusion <- NULL
}

##### ...copy-number (CN) section: include only genes with CN values at or below "cn_bottom", or at or above "cn_top"
if ( runPurpleChunk ) {
  cn_data <- ref_dataset.list[[dataset]][["expr_mut_cn_data"]]
  ref_genes.list[["summary"]]$CN <- as.character(cn_data[ cn_data$CN <= cn_bottom | cn_data$CN >= cn_top,  ]$Gene)
  rm(cn_data)
}

##### ...immune markers section: include immune markers and, if the immunogram is requested, also the immunogram genes (listed first)
immune_genes <- ref_genes.list[["genes_immune"]]$immune_markers$SYMBOL

if ( params$immunogram ) {
  immune_genes <- c(ref_genes.list[["genes_immune"]]$immunogram$SYMBOL, immune_genes)
}

ref_genes.list[["summary"]]$Immune <- unique(immune_genes)
rm(immune_genes)
```

```{r goi_sunburst, comment = NA, message=FALSE, warning=FALSE}
suppressMessages(library(plotly))

##### Prepare dataframe for Sunburst plot summarising all altered genes
##### Start with the lists of altered genes reported in the individual sections
alt_genes.all.list <- ref_genes.list[["summary"]]

##### Don't show cancer genes list (too long)
alt_genes.all.list$Cancer <- NULL

##### Count how many times each gene is listed across all sections (most frequently listed genes first)
alt_genes.all <- sort(table(unlist(alt_genes.all.list)), decreasing = TRUE)

##### For each alteration type replace the list of genes with the genes' counts. Remove alteration types with no alterations detected
for ( alt in names(alt_genes.all.list) ) {
  
  alt_counts <- alt_genes.all[ names(alt_genes.all) %in% alt_genes.all.list[[ alt ]] ]

  if ( length(alt_counts) > 0 ) {
    alt_genes.all.list[[ alt ]] <- alt_counts
  } else {
    alt_genes.all.list[[ alt ]] <- NULL
  }
}

##### Inner ring of the plot: one entry per alteration type
sunburst.all.df <- data.frame(
  ids = names(alt_genes.all.list),
  labels = names(alt_genes.all.list),
  parents = rep("", length(alt_genes.all.list)),
  values = as.numeric(lengths(alt_genes.all.list))/100,
  stringsAsFactors = FALSE
)

##### Outer ring of the plot: one entry per gene within each alteration type
for ( alt in names(alt_genes.all.list) ) {
  
  alt_gene_names <- names(alt_genes.all.list[[ alt ]])
  
  ##### Add only alteration type which has at least one alteration detected
  if ( any(names(alt_genes.all) %in% alt_gene_names) ) {
    
    sunburst.all.df <- rbind( sunburst.all.df , data.frame(
      ids = paste( alt, alt_gene_names, sep = " - "),
      labels = paste0("\t\t", alt_gene_names, "\t\t"),
      parents = rep( alt , length(alt_genes.all.list[[ alt ]])),
      values = as.numeric(alt_genes.all.list[[ alt ]])
    ) )
  }
}

##### The first plot presents all altered genes
sunburst_plot <- NULL
sunburst_plot[[1]] <- plot_ly(sunburst.all.df, ids = ~ids, labels = ~labels, parents = ~parents, values = ~values, type = 'sunburst', width = 600, height = 600)

##### Now include only genes that are listed more than once across the alteration types
alt_genes.list <- alt_genes.all.list
alt_genes <- alt_genes.all[ alt_genes.all > 1 ]

##### For each alteration type keep only the genes listed more than once. Remove alteration types with no such genes
for ( alt in names(alt_genes.list) ) {
  
  alt_recurrent <- alt_genes[ names(alt_genes) %in% names(alt_genes.list[[ alt ]]) ]

  if ( length(alt_recurrent) > 0 ) {
    alt_genes.list[[ alt ]] <- alt_recurrent
  } else {
    alt_genes.list[[ alt ]] <- NULL
  }
}
  
##### Inner ring of the plot: one entry per alteration type
sunburst.df <- data.frame(
  ids = names(alt_genes.list),
  labels = names(alt_genes.list),
  parents = rep("", length(alt_genes.list)),
  values = as.numeric(lengths(alt_genes.list))/100,
  stringsAsFactors = FALSE
)
  
##### Outer ring of the plot: one entry per gene within each alteration type
for ( alt in names(alt_genes.list) ) {
  
  alt_recurrent_names <- names(alt_genes.list[[ alt ]])
  
  sunburst.df <- rbind( sunburst.df , data.frame(
    ids = paste( alt, alt_recurrent_names, sep = " - "),
    labels = paste0("\t\t", alt_recurrent_names, "\t\t"),
    parents = rep( alt , length(alt_genes.list[[ alt ]])),
    values = as.numeric(alt_genes.list[[ alt ]])
  ) )
}

##### The second plot is generated only if there is anything to present. Otherwise it is set to NA
if ( nrow(sunburst.df) > 0 ) {
  sunburst_plot[[2]] <- plot_ly(sunburst.df, ids = ~ids, labels = ~labels, parents = ~parents, values = ~values, type = 'sunburst', width = 600, height = 600)
} else {
  sunburst_plot[[2]] <- NA
}

##### Create directory for the plots
summaryPlotsDir <- paste(results_dir, "summaryPlots", sep = "/")
if ( !file.exists(summaryPlotsDir) ) {
  dir.create(summaryPlotsDir, recursive=TRUE)
}
  
##### Save interactive plot as html file
saveWidgetFix(sunburst_plot[[1]], file = paste(summaryPlotsDir, "sunburst_plot_all.html", sep = "/"))

##### The second plot is set to NA if there was nothing to present
##### NOTE: identical() is used since is.na() applied to a plot object returns one value per element of the object, and such a vector condition is rejected by "if" (error in R >= 4.2)
if ( !identical(sunburst_plot[[2]], NA) ) {
  saveWidgetFix(sunburst_plot[[2]], file = paste(summaryPlotsDir, "sunburst_plot.html", sep = "/"))
}

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload=FALSE)

#### Clear plots to free up some memory
if(!is.null(dev.list())) invisible(dev.off())

##### Clean the space
rm(sunburst.all.df, sunburst.df)
```

```{r goi_summary_table, comment = NA, message=FALSE, warning=FALSE}
##### Prepare dataframe for a table summarising all altered genes, with one row per gene and the following columns: "Gene", one "Yes" / "-" column per alteration type, "Resources" (links to external resources) and "Count"
##### Create lists with alterations detected in each gene
genes.list <- names(alt_genes.all)
summary_table.list <- vector("list", length(genes.list))
names(summary_table.list) <- genes.list
  
##### Go through all altered genes and note the alteration types
for ( gene in names(alt_genes.all) ) {
  for ( alt in names(alt_genes.all.list) ) {
    if ( gene %in% names(alt_genes.all.list[[ alt ]])  ) {
      summary_table.list[[ gene ]] <- c( summary_table.list[[ gene ]], "Yes" )
    } else {
      summary_table.list[[ gene ]] <- c( summary_table.list[[ gene ]], "-" )
    }
  }
  
  ##### Add links to external resources (combined into a single, comma-separated entry)
  ##### Provide link to VICC meta-knowledgebase ( https://search.cancervariants.org )
  resources <- paste0("<a href='https://search.cancervariants.org/#", gene, "' target='_blank'>VICC</a>")
      
  ##### Provide link to OncoKB. isTRUE() guards against missing values in the "OncoKB" column
  if ( gene %in% rownames(ref_genes.list[["genes_oncokb"]]) && isTRUE(ref_genes.list[["genes_oncokb"]][  gene, "OncoKB"] == "Yes") ) {
    resources <- paste( resources , paste0("<a href='http://oncokb.org/#/gene/", gene, "' target='_blank'>OncoKB</a>"), sep = ", ")
  }
      
  ##### Provide link to CIViC database druggable genes ( https://civicdb.org )
  ##### Only the first URL is used if several distinct URLs are listed for the gene, so that there is always exactly one "Resources" entry per gene
  if ( gene %in% caner_genes_annot.list[["civic_clin_evid"]]$gene ) {
    civic_url <- unique(caner_genes_annot.list[["civic_clin_evid"]][ caner_genes_annot.list[["civic_clin_evid"]]$gene %in% gene , "gene_civic_url"])[1]
    resources <- paste( resources , paste0("<a href='", civic_url, "' target='_blank'>CIViC</a>"), sep = ", ")
  }
  
  summary_table.list[[ gene ]] <- c( summary_table.list[[ gene ]], resources )
}

##### Convert the list into data frame (one row per gene)
summary_table.df <- data.frame(matrix(unlist(summary_table.list), nrow=length(summary_table.list), byrow=TRUE),stringsAsFactors=FALSE)

##### Add gene names and the number of times individual genes are listed across the alteration types
summary_table.df <- cbind(names(summary_table.list), summary_table.df)
summary_table.df <- cbind(summary_table.df, as.numeric(alt_genes.all))
colnames(summary_table.df) <- c("Gene", names(alt_genes.all.list), "Resources", "Count")

##### Add GeneCards links
summary_table.df$Gene <- paste0("<a href='https://www.genecards.org/cgi-bin/carddisp.pl?gene=", summary_table.df$Gene, "' target='_blank'>", summary_table.df$Gene, "</a>")

##### Clean the space and return output
rm(summary_table.list)
```

***

<details>
<summary>Input data summary</summary>

**Reference patient cohorts**

The following reference patient cohorts were used for the analysis:

* **`r paste(ref_ext_cancer, ext_cancer_group, sep=" ")`** samples from [The Cancer Genome Atlas](https://github.com/umccr/RNAseq-Analysis-Report/blob/master/TCGA_projects_summary.md#tcga-projects-summary){target="_blank"} project ([related publication](https://www.nature.com/articles/ng.2764){target="_blank"})
* **`r paste(ref_int_cancer, int_cancer_group, sep=" ")`** samples from [University of Melbourne Centre for Cancer Research](https://research.unimelb.edu.au/centre-for-cancer-research/home){target="_blank"} samples collection

**Input genes**

Out of the `r nrow(ref_dataset.list[[dataset]][["combined_data"]])` input genes **`r nrow(data.annot)`** are used for analyses:

* **`r if (params$filter) { nrow(data.annot) - nrow(genes2keep[ !genes2keep$EXP, ]) } else { ("0") }`** have reliably detected expression
* **`r if (params$filter) { nrow(genes2keep[ !genes2keep$EXP, ]) } else { ("0") }`** are not expressed but are of interest and are included in analyses
* `r if (params$filter) { nrow(ref_dataset.list[[dataset]][["combined_data"]]) - nrow(ref_dataset.list[[dataset]][["data_to_report"]]) } else { ("0") }` are either not expressed or their expression level is too low to be detected
* `r if (params$filter) { nrow(ref_dataset.list[[dataset]][["data_to_report"]]) - nrow(data.annot) } else { nrow(ref_dataset.list[[dataset]][["combined_data"]]) - nrow(data.annot) }` genes were ignored due to lack of [HGNC](https://www.genenames.org/){target="_blank"}-approved gene symbol

NOTE, the `r if (params$filter) { nrow(ref_dataset.list[[dataset]][["combined_data"]]) - nrow(ref_dataset.list[[dataset]][["data_to_report"]]) + nrow(genes2keep[ !genes2keep$EXP, ]) } else { ("0") }` genes with no/low expression are indicated in <span style="color:#808080">BLANK</span> cells with missing values in *expression summary tables* in [Mutated genes], [Structural variants], [CN altered genes], [Immune markers], [HRD genes] and [Cancer genes] sections.


**Library size**

Bar-plot illustrating library size for each sample.

```{r library_size, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3}
##### Display the library size bar-plot (the "library_size" object is created outside this chunk)
library_size
```

**Data filtering and transformation**

The read count data were converted into **`r params$transform`**s using *[edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html){target="_blank"}* functions. `r if ( !params$filter ) { c("The option for filtering out genes with low counts is switched OFF") } else if ( params$filter ) { c("Genes with low counts were filtered out") }`. `r if ( !params$log ) { c("The data were not log-transformed") } else if ( params$log ) { c("The data were log2-transformed") }`. 

`r if ( params$transform == "CPM" ) { paste0("The CPM of 1 (cut-off for removing low expressed genes) corresponds to ", cpm.min, " reads in sample with the lowest sequencing depth, and ", cpm.max, " reads in sample with the greatest sequencing depth. The plot below presents the relation between read counts and the corresponding ", params$transform, " values in the patient data. The red vertical line indicates the threshold for filtering genes with low counts.") } else { paste0("The plot below presents the relation between read counts and the corresponding ", params$transform, " values in the patient data. The red vertical line indicates the threshold for filtering genes with low counts.") }`

```{r counts_vs_transformed, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, eval = params$filter}
##### Display the read counts vs transformed values plot (this chunk is evaluated only when params$filter is TRUE)
counts_vs_transformed
```


Plot(s) below present `r params$transform` data distribution`r if ( params$filter ) { c(" before and after filtering genes with low counts.") } else { c(".") }`
 
```{r data_transformation_display, fig.width = 12, fig.height = 5, thumb = list(width = 15, height = 15) }
##### Display the data distribution plot for non-filtered data
data_transformation_nonfiltered

##### Display the plot for filtered data only when filtering of genes with low counts is switched on
if ( params$filter ) {
  data_transformation_filtered
}
```

**Data normalisation**

During the sample preparation or sequencing process, external factors that are not of biological interest can affect the expression of individual samples. It is assumed that all samples should have a similar range and distribution of expression values. Normalisation for sample-specific effects is required to ensure that the expression distributions of each sample are similar across the entire experiment. Normalisation is performed using **`r params$norm`** method.

Box-plots below present `r params$transform` data for individual samples, coloured by sample groups, before and after `r params$norm` normalisation.

```{r data_normalisation_display, fig.width = 12, fig.height = 9, thumb = list(width = 15, height = 15) }
##### Display the box-plots for data before normalisation
data_nonnormalised

##### Display the box-plots for normalised data unless the normalisation method is set to "none"
if ( params$norm != "none" ) {
  data_normalised
}
```

**Exploratory data analysis**

`r if ( params$batch_rm ) { paste0("The expression data produced by different studies are confounded by non-biological experimental variances that prevent direct comparison of samples from different studies. In order to minimise the variance caused by confounding factors [limma removeBatchEffect](http://web.mit.edu/~r/current/arch/i386_linux26/lib/R/library/limma/html/removeBatchEffect.html){target=\"_blank\"} method was used to adjust expression measurements for potential batch effects. In brief, the strategy is to consider the investigated sample and the ", paste(ref_int_cancer, int_cancer_group, sep=" "), " samples as one batch (regardless of the investigated sample tissue origin) and ", paste(ref_ext_cancer, ext_cancer_group, sep=" "), " samples (of any cancer type) as another batch. The objective is to remove as much data variation due to technical factors as possible.") }`

Principal component analysis (PCA) was performed to reduce the dimensionality of data to visually assess similarities and differences between samples. This exploratory analysis facilitates identification of the key factors affecting the variability in the expression data.

* **PCA plot**

`r if ( params$batch_rm ) { paste0("Scatter-plots of the first 2 principal components (PCs) constituting the primary source of variation in the data before and after batch effects correction.") } else { paste0("Scatter-plot of the first 2 principal components (PCs) constituting the primary source of variation in the data.") }`

```{r pca_combined_data_display, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 8 }
##### Display the second element of the PCA results for combined data (presented in the report as the PCA scatter-plot)
ref_dataset.list[[dataset]][["pca_combined_data_processed"]][[2]]

#### Clear plots to free up some memory (close the current graphics device if any device is open)
if(!is.null(dev.list())) invisible(dev.off())
```

```{r pca_batch_effect_corrected_display, comment = NA, message=FALSE, warning=FALSE, eval=params$batch_rm, fig.width = 8, fig.height = 8 }
##### Display the second element of the PCA results for batch-effect corrected data (presented in the report as the PCA scatter-plot). This chunk is evaluated only when params$batch_rm is TRUE
ref_dataset.list[[dataset]][["pca_batch_effect_corrected"]][[2]]

#### Clear plots to free up some memory (close the current graphics device if any device is open)
if(!is.null(dev.list())) invisible(dev.off())
```

* **Scree-plot**

`r if ( params$batch_rm ) { paste0("Scree-plots presenting the fraction of total variance (*y-axis*) attributed to each PC (*x-axis*) before and after batch effects correction. The PCs are ordered by decreasing order of contribution to total variance.") } else { paste0("Scree-plot presenting the fraction of total variance (*y-axis*) attributed to each PC (*x-axis*). The PCs are ordered by decreasing order of contribution to total variance. ") }`

```{r scree_combined_data_display, comment = NA, message=FALSE, warning=FALSE, fig.width = 12, fig.height = 5 }
##### Display the third element of the PCA results for combined data (presented in the report as the scree-plot)
ref_dataset.list[[dataset]][["pca_combined_data_processed"]][[3]]

#### Clear plots to free up some memory (close the current graphics device if any device is open)
if(!is.null(dev.list())) invisible(dev.off())
```

`r if ( params$batch_rm ) { c("* After batch-effects correction") }`

```{r scree_batch_effect_corrected_display, comment = NA, message=FALSE, warning=FALSE, eval=params$batch_rm, fig.width = 12, fig.height = 5 }
##### Display the third element of the PCA results for batch-effect corrected data (presented in the report as the scree-plot). This chunk is evaluated only when params$batch_rm is TRUE
ref_dataset.list[[dataset]][["pca_batch_effect_corrected"]][[3]]

#### Clear plots to free up some memory (close the current graphics device if any device is open)
if(!is.null(dev.list())) invisible(dev.off())
```

* **RLE plot**

The relative log expression (RLE) plot is a useful diagnostic tool to visualise the differences between the distributions of read counts across samples. It shows boxplot of the log-ratios of the gene-level read counts (*y-axis*) of each sample to those of a reference sample (defined as the median across the samples). Ideally, the distributions should be centered around the zero line and as tight as possible. Clear deviations indicate the need for normalisation and/or the presence of outlying samples.

```{r rle_display, comment = NA, message=FALSE, warning=FALSE, fig.width = 12, fig.height = 9, thumb = list(width = 15, height = 15)}
##### Display the RLE plot stored for the combined (processed) data
ref_dataset.list[[dataset]][["rle_combined_data_processed"]]

##### Display the second RLE plot unless the normalisation method is set to "none"
##### NOTE(review): the displayed element is named "rle_batch_effect_corrected" while the condition tests params$norm rather than params$batch_rm - confirm this is intended
if ( params$norm != "none" ) {
  ref_dataset.list[[dataset]][["rle_batch_effect_corrected"]]
}
```

</details>

`r if ( runClinicalChunk ) { c("***") }`

`r if ( runClinicalChunk ) { c("## Clinical information") }`

`r if ( runClinicalChunk ) { c("#### Treatment timeline") }`

`r if ( runClinicalChunk ) { c("NOTE: for confidentiality reasons, the start of the timeline (*x-axis*) projecting patient's treatment regimens (*y-axis*) is set to 1st January 2000, but the treatments lengths are preserved.") }`

```{r treatment_timeline_plot, comment = NA, message=FALSE, warning=FALSE, fig.width = 12, fig.height = 5, thumb = list(width = 15, height = 15), eval = runClinicalChunk }
##### Present the treatment timeline plot (the "treatment_timeline" object is created outside this chunk; the chunk is evaluated only when runClinicalChunk is TRUE)
treatment_timeline
```

***

## Findings summary {.tabset}

### Per-alteration plot

`r if ( !is.na(sunburst_plot[[2]]) ) { c("**Genes** listed **in at least two sections** of this report") } else { c("**All altered genes**")  }` are summarised in the plot below. `r if ( !is.na(sunburst_plot[[2]]) ) { c("These genes may be of particular interest given that the evidence for their alteration is derived from multiple sources.")  }` The number next to each gene indicates the number of times it appears across the following report sections: `r if ( runPcgrChunk ) { c("[Mutated genes], ") }` `r if ( runFusionChunk ) { c("[Fusion genes] (supported by genomic data or reported in FusionGDB), ") }` `r if ( runSVsChunk ) { c("[Structural variants], ") }` `r if ( runPurpleChunk ) { paste0("[CN altered genes] (CN values =< ", cn_bottom, " or >= ", cn_top, " and reported as cancer genes)") }` [Immune markers] or [HRD genes]. That number is also reflected by the *width* of corresponding branches. Click on the category of interest to expand corresponding branches. Genes within each category are ordered by the number of report sections in which they appear and then alphabetically.

`r if ( is.na(sunburst_plot[[2]]) ) { c("<span style=\"color:#ff0000\">NOTE</span>, no genes are listed in more than one section.") }`
  
```{r findings_summary_plot, comment = NA, message=TRUE, warning=FALSE, fig.width = 8, fig.height = 6 }
##### Update MySQL command to populate RNA-seq data portal
mysql_populate <- paste0(mysql_populate, "Findings summary")
mysql_populate_update <- paste0(mysql_populate_update, "Findings summary")

##### Present per-alteration findings summary sunburst plot. The plot limited to genes listed in at least two report sections (sunburst_plot[[2]]) is shown when available. When that element is a single NA the plot for all altered genes (sunburst_plot[[1]]) is shown instead
##### NOTE: is.na() applied to a plot object (a list) returns one value per element and "if" stops on conditions longer than one in R >= 4.2, hence the explicit scalar NA test
if ( is.atomic(sunburst_plot[[2]]) && length(sunburst_plot[[2]]) == 1 && is.na(sunburst_plot[[2]]) ) {
  sunburst_plot[[1]]
} else {
  sunburst_plot[[2]]
}

#### Clear plots to free up some memory (close the current graphics device if any device is open)
if ( !is.null(dev.list()) ) invisible(dev.off())
```

`r if ( !is.na(sunburst_plot[[2]]) ) { c("<details>\n<summary>Show all altered genes</summary>") }`

```{r findings_summary_all_plot, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 6 }
##### Present per-alteration findings summary sunburst plot for all altered genes (sunburst_plot[[1]]). It is shown here only when the plot limited to genes listed in at least two report sections (sunburst_plot[[2]]) is available, i.e. is not a single NA
##### NOTE: is.na() applied to a plot object (a list) returns one value per element and "if" stops on conditions longer than one in R >= 4.2, hence the explicit scalar NA test
if ( !( is.atomic(sunburst_plot[[2]]) && length(sunburst_plot[[2]]) == 1 && is.na(sunburst_plot[[2]]) ) ) {
  sunburst_plot[[1]]
}

#### Clear plots to free up some memory (close the current graphics device if any device is open)
if ( !is.null(dev.list()) ) invisible(dev.off())
```

`r if ( !is.na(sunburst_plot[[2]]) ) { c("</details>") }`

```{r clear_space, comment = NA, message=FALSE, warning=FALSE }
##### Clean the space - remove the plot objects and the "data.annot" object that were presented or used in the report sections above
rm(counts_vs_transformed, data_transformation_nonfiltered, data_transformation_filtered, data_nonnormalised, data_normalised, treatment_timeline, data.annot)
```

***

### Per-gene table

Table summarising **all altered genes** listed across following report sections: `r if ( runPcgrChunk ) { c("[Mutated genes], ") }` `r if ( runFusionChunk ) { c("[Fusion genes] (supported by genomic data or reported in FusionGDB), ") }` `r if ( runSVsChunk ) { c("[Structural variants], ") }` `r if ( runPurpleChunk ) { paste0("[CN altered genes] (CN values =< ", cn_bottom, " or >= ", cn_top, " and reported as cancer genes)") }` [Immune markers] or [HRD genes]. The *Resources* column contains links to databases that may provide additional source of evidence for the altered genes' clinical significance. Genes ordered by the number of report sections they appear in (*Count* column) and then alphabetically.

`r if ( is.na(sunburst_plot[[2]]) ) { c("<span style=\"color:#ff0000\">NOTE</span>, no genes are listed in more than one section.") }`

```{r findings_summary_table, comment = NA, message=FALSE, warning=FALSE }
##### Present per-gene findings summary table
findings.summary <- DT::datatable(
  data = summary_table.df,
  filter = "none",
  rownames = FALSE,
  extensions = c('Buttons','Scroller'),
  options = list(pageLength = 10, dom = 'Bfrltip', buttons = c('excel', 'csv', 'pdf','copy','colvis'), scrollX = TRUE, deferRender = TRUE, scrollY = "333px", scroller = TRUE),
  width = 800,
  height = 455,
  caption = htmltools::tags$caption( style = 'caption-side: top; text-align: left; color:grey; font-size:100%'),
  escape = FALSE) %>%
      DT::formatStyle( columns = colnames(summary_table.df), `font-size` = '12px', 'text-align' = 'center' ) %>%
      ##### Colour cells in the report section columns only, i.e. all columns except the first one and the last two ("-" = transparent, "Yes" = black with white font)
      ##### NOTE: the previous index "2:ncol(summary_table.df)-2" was evaluated as "(2:ncol(summary_table.df))-2" and therefore started from the first column
      DT::formatStyle(columns = colnames(summary_table.df)[ -c(1, ncol(summary_table.df) - 1, ncol(summary_table.df)) ],
                      backgroundColor = DT::styleEqual(c("-", "Yes"), c("transparent", "black")), color = DT::styleEqual(c("-", "Yes"), c("black", "white")))

findings.summary

##### Create directory for tables (if it does not exist yet)
summaryTableDir <- paste(results_dir, "summaryTables", sep = "/")
if ( !file.exists(summaryTableDir) ) {
    dir.create(summaryTableDir, recursive=TRUE)
}

##### Save the table as a self-contained html file
saveWidgetFix(widget=findings.summary, file=paste(summaryTableDir, "findings.summary.html", sep = "/"), selfcontained=TRUE)

##### Clean the space
rm(summary_table.df, findings.summary, sunburst_plot)
```

***

## Mutated genes

mRNA expression levels of genes containing single nucleotide variants (SNVs) or insertions/deletions (indels), obtained from the [PCGR](https://github.com/sigven/pcgr){target="_blank"} report, in patient's sample and their average mRNA expression in samples from cancer cohorts. NOTE, only PCGR [tier](https://pcgr.readthedocs.io/en/latest/tier_systems.html#tier-model-2-pcgr-acmg){target="_blank"} 1-`r params$pcgr_tier` `r if ( params$pcgr_splice_vars ) { c("and non-coding splice region " ) }` variants are reported.

`r if ( !runPcgrChunk ) { c("Mutation data for this sample is **NOT AVAILABLE**.") }` 

### - Summary table {.tabset}

Out of the `r length(unique(ref_genes.list[["pcgr"]]$SYMBOL))` mutated genes `r length(unique(ref_genes.list[["pcgr"]][ ref_genes.list[["pcgr"]]$TIER %in% c(1:params$pcgr_tier), ]$SYMBOL))` include [tier](https://pcgr.readthedocs.io/en/latest/tier_systems.html#tier-model-2-pcgr-acmg){target="_blank"} 1-`r params$pcgr_tier` variants `r if ( params$pcgr_splice_vars ) { paste0("and ", length(ref_genes.list[["summary"]]$Mutated) - length(unique(ref_genes.list[["pcgr"]][ ref_genes.list[["pcgr"]]$TIER %in% c(1:params$pcgr_tier), ]$SYMBOL)), " non-coding splice region variant" ) }`. Of these, the expression of **`r length(which(ref_genes.list[["summary"]]$Mutated %in% rownames(ref_dataset.list[[dataset]][["data_to_report"]])))`** was reliably measured in patient's sample. The remaining `r length(which(ref_genes.list[["summary"]]$Mutated %!in% rownames(ref_dataset.list[[dataset]][["data_to_report"]])))` genes are either not expressed or their expression level is too low to be detected (indicated in <span style="color:#808080">BLANK</span> cells with missing values).

#### Percentiles

```{r mut_genes_table_perc, comment = NA, message=FALSE, warning=FALSE, eval = runPcgrChunk}
##### Update MySQL command to populate RNA-seq data portal
mysql_populate <- paste0(mysql_populate, ",Mutated genes")
mysql_populate_update <- paste0(mysql_populate_update, ",Mutated genes")

##### Expression data and samples annotation used for the mutated genes expression summary (based on PCGR report)
data <- ref_dataset.list[[dataset]][["data_to_report"]]
targets <- ref_dataset.list[[dataset]][["sample_annot"]]

##### Consider only genes with mutations classified within user-defined tiers
genes <- ref_genes.list[["summary"]]$Mutated

##### Work out how many genes will be profiled. Start from "all genes, no limit" and then deal with no genes or with more genes than the user-defined maximum (params$top_genes)
limit_genes <- FALSE
genes_no <- length(genes)

if ( genes_no == 0 ) {
  genes <- NULL
} else if ( genes_no > params$top_genes ) {
  limit_genes <- TRUE
  genes_no <- params$top_genes
}

##### Generate expression (percentiles) summary table for mutated genes
mut_genes.expr.perc <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
  mut_annot = ref_genes.list[["pcgr"]][, c("SYMBOL", "TIER", "CONSEQUENCE", "VARIANT_CLASS", "AF_TUMOR", "GENOMIC_CHANGE", "PROTEIN_CHANGE")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline") ],
  ext_links = TRUE,
  type = "perc",
  scaling = scaling
)

##### Present the expression summary table (first element of the "exprTable" output)
mut_genes.expr.perc[[1]]

##### Save the expression table as html file (only if requested by the user)
if ( params$save_tables ) {
  saveWidgetFix(
    widget = mut_genes.expr.perc[[1]],
    file = paste(exprTableDir, "mut_genes.expr.perc.html", sep = "/"),
    selfcontained = TRUE
  )
}
```

<details>
<summary>Table legend</summary>
<font size="2">

The <span style="color:#ff0000">RED</span> colour range indicates relatively **high expression** (percentile) values and the <span style="color:#0000ff">BLUE</span> colour range indicates relatively **low expression** (percentile) values in individual sample groups. The <span style="color:#808080">BLANK</span> cells with missing values indicate genes with **no/low expression**. The **Diff** (**Patient vs `r comp_cancer_group`**) column illustrates the difference between percentiles in patient sample and reference cancer cohort for each mutated gene. Variants' tier, consequence, class and tumour allele frequency (AF), as well as genomic and protein change are also provided based on information from the [PCGR](https://github.com/sigven/pcgr){target="_blank"} report. In case of multiple variants detected in a single gene, the variant with the lowest tier is reported and other potential consequences are listed in column *CONSEQUENCE_OTHER*. Genes are ordered by **increasing variants TIER** and then by **decreasing** absolute values in the **Diff** (**Patient vs `r comp_cancer_group`**) column.

</font>
</details>

`r if ( runPcgrChunk && length(genes) > 2000 ) { c(paste0("<span style=\"color:#ff0000\">NOTE</span>, the table was truncated to 2000 entries.")) } else { cat("") }`

***

#### Z-scores

```{r mut_genes_table, comment = NA, message=FALSE, warning=FALSE, eval = runPcgrChunk, results = "asis"}
##### Generate expression (Z-scores) summary table for mutated genes and keep only the first element of the "exprTable" output (the table to present)
mut_genes.expr.z <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
  mut_annot = ref_genes.list[["pcgr"]][, c("SYMBOL", "TIER", "CONSEQUENCE", "VARIANT_CLASS", "AF_TUMOR", "GENOMIC_CHANGE", "PROTEIN_CHANGE")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline") ],
  ext_links = TRUE,
  type = "z",
  scaling = scaling
)[[1]]

##### Present the expression summary table
mut_genes.expr.z

##### Save the expression table as html file (only if requested by the user)
if ( params$save_tables ) {
  saveWidgetFix(
    widget = mut_genes.expr.z,
    file = paste(exprTableDir, "mut_genes.expr.z.html", sep = "/"),
    selfcontained = TRUE
  )
}

##### Clean the space
rm(mut_genes.expr.z)
```

<details>
<summary>Table legend</summary>
<font size="2">

The <span style="color:#ff0000">RED</span> colour range indicates relatively **high expression** (Z-score) values and the <span style="color:#0000ff">BLUE</span> colour range indicates relatively **low expression** (Z-score) values in individual sample groups. The <span style="color:#808080">BLANK</span> cells with missing values indicate genes with **no/low expression**. The **Diff** (**Patient vs `r comp_cancer_group`**) column illustrates the difference between Z-scores in patient sample and reference cancer cohort for each mutated gene. Variants' tier, consequence, class and tumour allele frequency (AF), as well as genomic and protein change are also provided based on information from the [PCGR](https://github.com/sigven/pcgr){target="_blank"} report. In case of multiple variants detected in a single gene, the variant with the lowest tier is reported and other potential consequences are listed in column *CONSEQUENCE_OTHER*. Genes are ordered by **increasing variants TIER** and then by **decreasing** absolute values in the **Diff** (**Patient vs `r comp_cancer_group`**) column.

</font>
</details>

`r if ( runPcgrChunk && length(genes) > 2000 ) { c(paste0("<span style=\"color:#ff0000\">NOTE</span>, the table was truncated to 2000 entries.")) } else { cat("") }`

***

### - Expression profiles {.tabset}

`r if ( exists("limit_genes") ) { if ( limit_genes ) { c(paste0("Expression profiles for ", genes_no, " mutated genes with variants annotated with the lowest [tier](https://pcgr.readthedocs.io/en/latest/tier_systems.html#tier-model-2-pcgr-acmg){target=\"_blank\"} and demonstrating the greatest difference in mRNA expression (percentile) values between patient's sample and the average mRNA expression in samples from cancer patients.")) } else { cat(" ") }}`

```{r cdf_plots_mut, echo=FALSE, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, eval = runPcgrChunk, results="asis"}
suppressMessages(library(plotly))

##### Generate empirical cumulative distribution function (ECDF) plot illustrating mRNA expression level for the genes of interest in the context of the overall mRNA expression distribution
output_cdf <- list()
output_counts <- list()
output_density <- list()

##### Genes to profile are taken from the second element of the percentiles expression table output
genes <- mut_genes.expr.perc[[2]]$SYMBOL

##### For each gene generate (1) CDF plot and add boxplot below to show the data variance for selected gene in individual groups, (2) bar-plot of read count data across all samples and (3) density plot to demonstrate expression distribution in investigated sample
##### NOTE: seq_len() yields an empty sequence when no genes are to be profiled (1:0 would iterate over 1 and 0)
for ( i in seq_len(genes_no) ) {

  ##### Plots are generated only for genes present in the expression data
  if ( genes[i] %in% rownames(data) ) {

    ##### CDF plot
    output_cdf[[i]] <- cdfPlot(gene = genes[i], data = data, targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group, addBoxPlot = TRUE, scaling = scaling, report_dir = results_dir)

    ##### Bar-plot of read counts
    ##### First map the gene symbol to Ensembl ID (used in the counts data)
    genes.ENSEMBL <- ref_dataset.list[[dataset]][["gene_annot_all"]]$ENSEMBL[ ref_dataset.list[[dataset]][["gene_annot_all"]]$SYMBOL ==  genes[i] ]

    output_counts[[i]] <- barPlot(gene = genes.ENSEMBL, data = ref_dataset.list[[dataset]][["combined_data"]], y_title = "Counts", targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group )

    ##### Density plot - expression distribution
    output_density[[i]] <- densityPlot(gene = genes[i], data = data, main_title= "", x_title = "Z-score", sampleName = sample_name, distributions = c("normal", "bimodal"), scaling = scaling)
  }

  #### Clear plots to free up some memory
  if ( !is.null(dev.list()) ) invisible(dev.off())
}

##### Now once the plots are ready show them in separate tabs (one level-4 header per gene)
if ( genes_no != 0 ) {
  for ( i in seq_len(genes_no) ) {

    cat("\n#### ", genes[i], "\n")

    if ( genes[i] %in% rownames(data) ) {

      ##### CDF plot with its legend
      cat(renderTags(output_cdf[[i]])$html)
      cat("\n<details>\n")
      cat("\n<summary>Plot legend</summary>\n")
      cat("<font size=\"2\">\n")
      cat(paste0("**Top panel**: distribution of percentile values (*y-axis*) as a function of expression levels (Z-scores, *x-axis*) for *", genes[i], "* in patient's sample (*black dot*) and other reference cancer cohort(s) (median value(s)).\n\n"))
      cat(paste0("**Bottom panel**: box-plot presenting expression level (Z-score) of *", genes[i], "* in patient's sample (*black dot*) and its expression levels observed across samples from other reference cancer cohort(s).\n"))
      cat("\n</font>\n")
      cat("\n</details>\n")

      ##### Read counts bar-plot
      cat("\n<details>\n")
      cat("\n<summary>Read counts</summary>\n")
      cat(renderTags(output_counts[[i]])$html)
      cat("<font size=\"2\">\n")
      cat(paste0("Bar-plot illustrating read counts for *", genes[i], "* across all samples. The *", genes[i], "* read count in patient's sample is indicated by *black bar*.\n"))
      cat("\n</font>\n")
      cat("\n</details>\n")

      ##### Expression distribution density plot
      ##### NOTE: each of the three "details" blocks is closed exactly once (an additional, unmatched closing tag was emitted here before)
      cat("\n<details>\n")
      cat("\n<summary>Expression distribution patterns</summary>\n")
      cat(renderTags(output_density[[i]])$html)
      cat("<font size=\"2\">\n")
      cat(paste0("Plot illustrating distribution of expression levels (Z-scores) of *", genes[i], "* *observed* across all samples along with simulated *normal* and *bimodal* distributions. The *", genes[i], "* expression level observed in patient's sample is indicated by *black dot* in each distribution.\n"))
      cat("\n</font>\n")
      cat("\n</details>\n")
      cat("\n***\n")

    } else {
      cat("\n<span style=\"color:#ff0000\">NOTE</span>, expression data is not available for that gene.\n")
      cat("\n***\n")
    }
  }

  #### Clear plots to free up some memory
  if ( !is.null(dev.list()) ) invisible(dev.off())

} else {
  cat("\nNo alterations were reported.\n")
  cat("\n***\n")
}

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload=FALSE)

##### Clean the space. The plot lists are removed by name (the previous regular expression '^output*' matched every object whose name starts with "outpu")
rm(output_cdf, output_counts, output_density)
rm(limit_genes)
```

`r if ( !runPcgrChunk ) { c("***") } else { c(" ") }`

## Fusion genes

<details>
<summary>Fusion genes prioritisation</summary>

Fusion genes detected in transcriptome data are prioritised in the following order:

1. Involvement of fusion gene(s) **detected in genomic data** (if [Structural variants] results are available)

2. **Detected in transcriptome data** by [Arriba](https://arriba.readthedocs.io/en/latest/){target="_blank"} tool

3. **Reported** fusion event according to [FusionGDB](https://ccsm.uth.edu/FusionGDB){target="_blank"} database

4. Decreasing number of **split reads**

5. Decreasing number of **pair reads**

6. Involvement of **cancer gene(s)** (see [Cancer genes] section)

</details>

<details>
<summary>Fusion genes filtering</summary>

Fusion genes detected in transcriptome data are reported if **at least one** of the following criteria is met:

1. Involvement of fusion gene(s) **detected in genomic data** (if [Structural variants] results are available)

2. **Reported** fusion event according to [FusionGDB](https://ccsm.uth.edu/FusionGDB){target="_blank"} database

3. Involvement of **cancer gene(s)** (see [Cancer genes] section)

4. **Split reads** > 1

5. **Pair reads** > 1 and **split reads** > 1

</details>

`r if ( !runFusionChunk ) { c("Fusion genes information for this sample is **NOT AVAILABLE**.") }`

### - Summary

Out of the `r if ( runFusionChunk ) { nrow(fusions) } else { c("0") }` fusion event(s) <span style="color:#ff0000">**`r if ( runFusionChunk ) { nrow(fusions[ fusions$geneA_dna_support == "Yes" | fusions$geneB_dna_support == "Yes" , ]) } else { c("0") }`**</span> involve **DNA-supported** fusion genes (see [Structural variants] section), <span style="color:#02d653">**`r if ( runFusionChunk ) { nrow(fusions[ fusions$reported_fusion == "Yes" , ]) } else { c("0") }`**</span> are **reported in [FusionGDB](https://ccsm.uth.edu/FusionGDB){target="_blank"}** and <span style="color:#767689">**`r if ( runFusionChunk ) { nrow(fusions[ fusions$fusions_cancer == "Yes" , ]) } else { c("0") }`**</span> involve **[Cancer genes]**.

**`r if ( !runFusionChunk ) { c("No fusion genes were detected!") }`**

```{r fusions_summary_table, comment = NA, message=FALSE, warning=FALSE}
##### Create a nice table output (with dataTable)
if ( runFusionChunk ) {
  
  ##### Update MySQL commend to populate RNA-seq data portal
  mysql_populate <- paste0(mysql_populate, ",Fusion genes")
  mysql_populate_update <- paste0(mysql_populate_update, ",Fusion genes")
  
  fusions.table <- fusions
  fusions.table$geneA <- as.vector(fusions.table$geneA)
  fusions.table$geneB <- as.vector(fusions.table$geneB)
  
  ##### Provide link to FusionGDB
  for ( i in 1:nrow(fusions.table) ) {
      if ( fusions.table$reported_fusion[i] == "Yes" ) {
        fusions.table$geneA[i] <- paste0("<a href='https://ccsm.uth.edu/FusionGDB/gene_search_result.cgi?page=page&type=quick_search&quick_search=", fusions.table$FGID[i], "' target='_blank'>", fusions.table$geneA[i], "</a>")
  
        fusions.table$geneB[i] <- paste0("<a href='https://ccsm.uth.edu/FusionGDB/gene_search_result.cgi?page=page&type=quick_search&quick_search=", fusions.table$FGID[i], "' target='_blank'>", fusions.table$geneB[i], "</a>")
      }
  }
  
  ##### Dragen + Arriba
  if ( runDragenFusionChunk && runArribaChunk ) {
    fusions.table <- fusions.table[ , names(fusions.table) %!in% "FGID" ]
    fusions.table <- fusions.table[ , c("geneA", "geneB", "split_reads", "split_readsA", "split_readsB", "discordant_mates", "geneA_dna_support", "geneB_dna_support", "reported_fusion", "fusions_cancer", "reported_fusion_geneA", "reported_fusion_geneB", "confidence", "score", "breakpointA", "breakpointB", "siteA", "siteB", "type", "circos")]
  names(fusions.table) <- c("Gene A", "Gene B", "Split reads (Total)", "Split reads (A)", "Split reads (B)", "Pair reads", "DNA support (A)", "DNA support (B)", "Reported fusion", "Cancer gene(s)", "Fusion gene (A)", "Fusion gene (B)", "Confidence (Arriba)", "Score (Dragen)", "Breakpoint (A)", "Breakpoint (B)", "Site (A)", "Site (B)", "Type", "Genomic view")
  
  ##### Arriba / Arriba + Pizzly
  } else if ( runArribaChunk ) {
    fusions.table <- fusions.table[ , names(fusions.table) %!in% "FGID" ]
    fusions.table <- fusions.table[ , c("geneA", "geneB", "split_reads", "split_readsA", "split_readsB", "discordant_mates", "geneA_dna_support", "geneB_dna_support", "reported_fusion", "fusions_cancer", "reported_fusion_geneA", "reported_fusion_geneB", "confidence", "breakpointA", "breakpointB", "siteA", "siteB", "type", "circos")]
  names(fusions.table) <- c("Gene A", "Gene B", "Split reads (Total)", "Split reads (A)", "Split reads (B)", "Pair reads", "DNA support (A)", "DNA support (B)", "Reported fusion", "Cancer gene(s)", "Fusion gene (A)", "Fusion gene (B)", "Confidence (Arriba)", "Breakpoint (A)", "Breakpoint (B)", "Site (A)", "Site (B)", "Type", "Genomic view")
  
  ##### Dragen only
  } else if ( runDragenFusionChunk ) {
    
    #####  Dragen's fusion format version 3.9.3
    if ( all(c("GeneALocation", "GeneBLocation", "NumSplitReads","NumSoftClippedReads", "Score") %in% colnames(dragen.fusions)) ) {
        fusions.table <- fusions.table[ , names(fusions.table) %!in% "FGID" ]
        fusions.table <- fusions.table[ , c("geneA", "geneB", "split_reads", "geneA_dna_support", "geneB_dna_support", "reported_fusion", "fusions_cancer", "reported_fusion_geneA", "reported_fusion_geneB", "score", "breakpointA", "breakpointB", "siteA", "siteB", "circos")]
      names(fusions.table) <- c("Gene A", "Gene B", "Split reads", "DNA support (A)", "DNA support (B)", "Reported fusion", "Cancer gene(s)", "Fusion gene (A)", "Fusion gene (B)", "Score", "Breakpoint (A)", "Breakpoint (B)", "Site (A)", "Site (B)", "Genomic view")
      
    #####  Dragen's fusion format prior to version 3.9.3
    } else {
      fusions.table <- fusions.table[ , names(fusions.table) %!in% "FGID" ]
        fusions.table <- fusions.table[ , c("geneA", "geneB", "geneA_dna_support", "geneB_dna_support", "reported_fusion", "fusions_cancer", "reported_fusion_geneA", "reported_fusion_geneB", "score", "breakpointA", "breakpointB", "circos")]
      names(fusions.table) <- c("Gene A", "Gene B", "DNA support (A)", "DNA support (B)", "Reported fusion", "Cancer gene(s)", "Fusion gene (A)", "Fusion gene (B)", "Score", "Breakpoint (A)", "Breakpoint (B)", "Genomic view")
    }
  
  ##### Pizzly only
  } else {
    fusions.table <- fusions.table[ , names(fusions.table) %!in% "FGID" ]
    fusions.table <- fusions.table[ , c("geneA", "geneB", "split_reads", "discordant_mates", "geneA_dna_support", "geneB_dna_support", "reported_fusion", "fusions_cancer", "reported_fusion_geneA", "reported_fusion_geneB", "circos")]
  names(fusions.table) <- c("Gene A", "Gene B", "Split reads", "Pair reads", "DNA support (A)", "DNA support (B)", "Reported fusion", "Cancer gene(s)", "Fusion gene (A)", "Fusion gene (B)", "Genomic view")
  }
  
  ##### Present gene fusion events in a table
  fusions.summary <- DT::datatable( data = fusions.table, filter = "none", rownames = FALSE, extensions = c('Buttons','Scroller'), options = list(pageLength = 10, dom = 'Bfrtip', buttons = c('excel', 'csv', 'pdf','copy','colvis'), scrollX = TRUE, deferRender = TRUE, scrollY = "333px", scroller = TRUE), width = 800, height = 490, caption = htmltools::tags$caption(style = 'caption-side: top; text-align: left; color:grey; font-size:100% ;'), escape = FALSE) %>%
      DT::formatStyle( columns = names(fusions.table), `font-size` = '12px', 'text-align' = 'center' ) %>%
    
      ##### Highlight rows with fusions involving cancer genes (grey) or DNA support (from MANTA, orange)
      DT::formatStyle( columns = colnames(fusions.table) %in% "Cancer gene(s)", backgroundColor = DT::styleEqual(c("-", "Yes"), c('transparent', 'lightgrey')) ) %>%
      DT::formatStyle( columns = colnames(fusions.table) %in% "DNA support (A)", backgroundColor = DT::styleEqual(c("-", "Yes"), c('transparent', 'coral')) ) %>%
      DT::formatStyle( columns = colnames(fusions.table) %in% "DNA support (B)", backgroundColor = DT::styleEqual(c("-", "Yes"), c('transparent', 'coral')) ) %>%
      DT::formatStyle( columns = colnames(fusions.table) %in% "Reported fusion", backgroundColor = DT::styleEqual( c("-", "Yes"), c('transparent', 'lightgreen')) )
  
  fusions.summary
  
} else {
  
  ##### Create empty table. The number of columns is derived from the header vector so that the two cannot get out of sync
  ##### (the table used to be created with 18 columns for 13 headers, which left 5 trailing columns named NA)
  fusions.table <- local({
    header <- c("Gene A", "Gene B", "Split reads", "Pair reads", "DNA support (A)", "DNA support (B)", "Reported fusion", "Cancer gene(s)", "Fusion gene (A)", "Fusion gene (B)", "Breakpoint (A)", "Breakpoint (B)", "Genomic view")
    empty <- data.frame(matrix(ncol = length(header), nrow = 0))
    names(empty) <- header
    empty
  })

  ##### Present the (empty) gene fusion table using the same widget settings as for the populated table
  fusions.summary <- DT::datatable( data = fusions.table, filter = "none", rownames = FALSE, extensions = c('Buttons','Scroller'), options = list(pageLength = 10, dom = 'Bfrtip', buttons = c('excel', 'csv', 'pdf','copy','colvis'), scrollX = TRUE, deferRender = TRUE, scrollY = "333px", scroller = TRUE), width = 800, height = 490, caption = htmltools::tags$caption(style = 'caption-side: top; text-align: left; color:grey; font-size:100% ;'), escape = FALSE) %>%
      DT::formatStyle( columns = names(fusions.table), `font-size` = '12px', 'text-align' = 'center' )
  
  fusions.summary
}

##### Export the fusion summary widget as a standalone html file (only when requested via params$save_tables)
if ( params$save_tables ) {

  ##### Folder for the fusion tables. It is created on first use
  fusionsTableDir <- paste(results_dir, "fusionsTables", sep = "/")
  if ( !file.exists(fusionsTableDir) ) dir.create(fusionsTableDir, recursive = TRUE)

  saveWidgetFix(
    widget = fusions.summary,
    file = paste(fusionsTableDir, "fusions.summary.html", sep = "/"),
    selfcontained = TRUE
  )
}

##### Drop the table objects that are no longer needed
rm(list = c("fusions.table", "fusions.summary"))
```

<details>
<summary>Table legend</summary>
<font size="2">

Cells in <span style="color:#ff0000">RED</span> indicate **DNA-supported** fusion genes (see [Structural variants] section), cells in <span style="color:#02d653">GREEN</span> indicate fusion events **reported in [FusionGDB](https://ccsm.uth.edu/FusionGDB){target="_blank"}**, and cells in <span style="color:#767689">GREY</span> indicate fusions containing **[Cancer genes]**. Gene fusions reported in [FusionGDB](https://ccsm.uth.edu/FusionGDB){target="_blank"} are hyperlinked. Genes known to be involved in gene fusions are flagged based on information provided in [FusionGDB](https://ccsm.uth.edu/FusionGDB){target="_blank"} and [Cancer Genome Interpreter](https://www.cancergenomeinterpreter.org/biomarkers){target="_blank"} (CGI) databases. *Breakpoint (A/B)* - genomic coordinates of the breakpoints in gene A/B; *Site (A/B)* - location of the breakpoints in gene A/B; *Type* - type of event based on the orientation of the supporting reads and the coordinates of breakpoints

Fusion events are ordered by the following columns:

**DNA support (A/B)**: DNA-supported fusion gene(s) (see [Structural variants] section)

**Confidence** level from [Arriba](https://arriba.readthedocs.io/en/latest/){target="_blank"} tool

**Reported fusion**: fusion event reported in [FusionGDB](https://ccsm.uth.edu/FusionGDB){target="_blank"}

**Split count**: the number of supporting split reads

**Pair count**: the number of supporting pair reads

**Cancer gene(s)**: gene fusion events involving [Cancer genes]

**Fusion gene (A/B)**: gene(s) known to be involved in tumorigenesis across cancer types based on [FusionGDB](https://ccsm.uth.edu/FusionGDB){target="_blank"} and [CGI](https://www.cancergenomeinterpreter.org/biomarkers){target="_blank"} databases

</font>
</details>

***

### - Genomic view

<span style="color:#ff0000">**`r if ( runFusionChunk ) { nrow(fusions[ (fusions$geneA_dna_support == "Yes" | fusions$geneB_dna_support == "Yes") & fusions$circos == "Yes", ]) } else { c("0") }`**</span> **DNA-supported** fusion genes (see [Structural variants] section) and <span style="color:#02d653">**`r if ( runFusionChunk ) { nrow(fusions[ fusions$reported_fusion == "Yes" & fusions$circos == "Yes", ]) } else { c("0") }`**</span> fusion events **reported in [FusionGDB](https://ccsm.uth.edu/FusionGDB){target="_blank"}** are presented in the genomic context. *<span style="color:#ff0000">Red</span>* colour is used for links between positions on the same chromosome and *<span style="color:#0000ff">blue</span>* for links between different chromosomes. The table at the bottom contains genomic coordinates of individual fusion genes sorted based on their genomic location.

<span style="color:#ff0000">NOTE</span>: **`r if ( runFusionChunk ) { nrow(fusions[ (fusions$geneA_dna_support == "Yes" | fusions$geneB_dna_support == "Yes" | fusions$geneA_dna_support == "Yes") & fusions$circos != "Yes", ]) } else { c("0") }`** of such fusions do not have genomic information available and are not presented on the *circos plot* (see *Genomic view* column in the [- Summary] table). 

`r if ( !runSVsChunk ) { c("Genomic data for this sample is **NOT AVAILABLE**.") }`

```{r circos_prep, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 8, eval = runFusionChunk}
##### Keep only fusions reported in FusionGDB and, when genomic (SV) data is available, also those with DNA support for either of the partner genes
if ( runSVsChunk ) {
  fusion_annot_top <- fusion_annot[ fusion_annot$reported_fusion == "Yes" | fusion_annot$geneA_dna_support == "Yes" | fusion_annot$geneB_dna_support == "Yes" , ]
} else {
  fusion_annot_top <- fusion_annot[ fusion_annot$reported_fusion == "Yes" , ]
}

if ( nrow(fusion_annot_top) > 0 ) {
  
  ##### Create folder for fusion plots
  fusionsPlotDir <- paste(results_dir, "fusionsPlot", sep = "/")
    
  if ( !dir.exists(fusionsPlotDir) ) {
    dir.create(fusionsPlotDir, recursive=TRUE)
  }
  
  ##### Prepare object for RCircos. The name of the ideogram dataset depends on the genome assembly, so the dataset is loaded and fetched by name rather than by building and evaluating code strings
  cyto.info.name <- paste0("UCSC.HG", params$ucsc_genome_assembly, ".Human.CytoBandIdeogram")
  data(list = cyto.info.name)
  cyto.info <- get(cyto.info.name)
  rm(cyto.info.name)
    
  ##### Keep only fusions with gene A located in one of the chromosomes present in the ideogram
  ##### NOTE(review): the chromosome of gene B (SEQNAME.1) is not checked here - confirm whether fusions with gene B on a non-standard contig can reach RCircos
  fusion_annot_top <- fusion_annot_top[ paste0("chr", fusion_annot_top$SEQNAME) %in% cyto.info$Chromosome,  ]
  
  ##### The filter above may have removed all fusions. Adding the "chr" prefix to an empty table would fail (paste0 returns a single "chr" for zero-length input), so skip the plot in that case
  if ( nrow(fusion_annot_top) > 0 ) {
    
    fusion_annot_top.circos.pairs <- fusion_annot_top[, c("SEQNAME", "GENESEQSTART", "GENESEQEND", "SYMBOL","SEQNAME.1", "GENESEQSTART.1", "GENESEQEND.1", "SYMBOL.1")]
    
    ##### Add "chr" to chromosome numbers
    fusion_annot_top.circos.pairs$SEQNAME <- paste0("chr", fusion_annot_top.circos.pairs$SEQNAME)
    fusion_annot_top.circos.pairs$SEQNAME.1 <- paste0("chr", fusion_annot_top.circos.pairs$SEQNAME.1)
    
    ##### Change column names. Both partner genes get the same set of names so that their coordinates can be stacked with rbind below
    names(fusion_annot_top.circos.pairs) <- rep(c("Chromosome", "chromStart", "chromEnd", "Gene"), times = 2)
    
    ##### Remove entries with missing genomic coordinates
    fusion_annot_top.circos.pairs <- fusion_annot_top.circos.pairs[complete.cases(fusion_annot_top.circos.pairs), ]
    
    ##### Per-gene table (gene A rows followed by gene B rows) used for the gene connectors and gene names
    fusion_annot_top.circos <- rbind(fusion_annot_top.circos.pairs[, 1:4 ], fusion_annot_top.circos.pairs[, 5:8 ])
    
    ##### Link table: coordinates of both partner genes without the gene names
    ##### NOTE(review): the duplicated column names are presumably made unique by the data frame column subsetting (second set suffixed with ".1") - confirm
    fusion_annot_top.circos.pairs <- fusion_annot_top.circos.pairs[, colnames(fusion_annot_top.circos.pairs) %!in% c("Gene", "Gene.1") ]
    
    ##### Generate circos plot
    RCircos.Set.Core.Components( cyto.info=cyto.info, chr.exclude=NULL, tracks.inside=4, tracks.outside=0 )
    RCircos.Set.Plot.Area()  
    RCircos.Chromosome.Ideogram.Plot()
    RCircos.Gene.Connector.Plot(genomic.data = fusion_annot_top.circos, track.num = 1, side="in") 
    RCircos.Gene.Name.Plot(gene.data = fusion_annot_top.circos, name.col = 4, track.num = 2, side = "in")
    RCircos.Link.Plot(link.data = fusion_annot_top.circos.pairs, track.num=4, by.chromosome=TRUE, is.sorted=FALSE, lineWidth=rep(2, nrow(fusion_annot_top.circos.pairs)))
  }
}
```

```{r fusions_genomic_view_circos_save, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, eval = runFusionChunk}
##### Write the circos plot representing gene fusion events to a png file. NOTE. Only fusions involving fusion genes supported by MANTA or reported fusions are presented
if ( nrow(fusion_annot_top) == 0 ) {
  
  ##### Nothing to plot, so inform the reader instead
  cat("None of the transcriptome-based fusion events have supporting evidence from DNA data or was previously reported.")
  
} else {
  
  ##### Open the png device and redraw the circos plot into it
  png(
    filename = paste(fusionsPlotDir, "circosPlot.png", sep="/"),
    width = 800, height = 800, units = "px", pointsize = 24
  )
  RCircos.Set.Core.Components( cyto.info=cyto.info, chr.exclude=NULL, tracks.inside=4, tracks.outside=0 )
  RCircos.Set.Plot.Area()
  RCircos.Chromosome.Ideogram.Plot()
  RCircos.Gene.Connector.Plot(genomic.data = fusion_annot_top.circos, track.num = 1, side="in")
  RCircos.Gene.Name.Plot(gene.data = fusion_annot_top.circos, name.col = 4, track.num = 2, side = "in")
  RCircos.Link.Plot(link.data = fusion_annot_top.circos.pairs, track.num=4, by.chromosome=TRUE, is.sorted=FALSE, lineWidth=rep(2, nrow(fusion_annot_top.circos.pairs)))
  invisible(dev.off())
  
  ##### The circos input tables are not needed any more
  rm(list = c("fusion_annot_top.circos", "fusion_annot_top.circos.pairs"))
  
  #### Close a graphics device that may still be open to free up some memory
  if ( !is.null(dev.list()) ) {
    invisible(dev.off())
  }
}
```

```{r genomic_view_circos_table_fusions, comment = NA, message=FALSE, warning=FALSE, eval = runFusionChunk}
if ( nrow(fusion_annot_top) > 0 ) {
  
  ##### Clean the table for better presentation
  ##### The columns to report (names) and the headers shown in the table (values) are defined together, so that the column selection and the renaming cannot get out of sync
  ##### NOTE(review): unlike the summary table, there is no dedicated Arriba-only branch here, so Arriba-only runs use the "Pizzly only" layout - confirm this is intended
  fusion_annot_top.headers <- local({
    
    gene.cols <- c(SYMBOL = "Gene A", SEQNAME = "Chrom (A)", GENESEQSTART = "Start (A)", GENESEQEND = "End (A)", SYMBOL.1 = "Gene B", SEQNAME.1 = "Chrom (B)", GENESEQSTART.1 = "Start (B)", GENESEQEND.1 = "End (B)")
    breakpoint.cols <- c(breakpointA = "Breakpoint (A)", breakpointB = "Breakpoint (B)")
    evidence.cols <- c(geneA_dna_support = "DNA support (A)", geneB_dna_support = "DNA support (B)", reported_fusion = "Reported fusion", fusions_cancer = "Cancer gene(s)")
    
    ##### Dragen + Arriba
    if ( runDragenFusionChunk && runArribaChunk ) {
      c(gene.cols, breakpoint.cols, split_reads = "Split reads (Total)", split_readsA = "Split reads (A)", split_readsB = "Split reads (B)", discordant_mates = "Pair reads", evidence.cols)
      
    ##### Dragen only
    } else if ( runDragenFusionChunk ) {
      
      #####  Dragen's fusion format version 3.9.3
      if ( all(c("GeneALocation", "GeneBLocation", "NumSplitReads","NumSoftClippedReads", "Score") %in% colnames(dragen.fusions)) ) {
        c(gene.cols, breakpoint.cols, split_reads = "Split reads", evidence.cols)
        
      #####  Dragen's fusion format prior to version 3.9.3
      } else {
        c(gene.cols, breakpoint.cols, evidence.cols)
      }
      
    ##### Pizzly only
    } else {
      c(gene.cols, split_reads = "Split reads", discordant_mates = "Pair reads", evidence.cols)
    }
  })
  
  fusion_annot_top.clean <- fusion_annot_top[, names(fusion_annot_top.headers) ]
  
  ##### Order fusions based on the genomic location (chrom and start positions)
  chrOrder <-c((1:22),"X","Y","M")
  
  fusion_annot_top.clean$SEQNAME <- factor(fusion_annot_top.clean$SEQNAME, chrOrder, ordered=TRUE)
  fusion_annot_top.clean$SEQNAME.1 <- factor(fusion_annot_top.clean$SEQNAME.1, chrOrder, ordered=TRUE)
  fusion_annot_top.clean <- fusion_annot_top.clean[do.call(order, fusion_annot_top.clean[, c("SEQNAME", "SEQNAME.1", "GENESEQSTART", "GENESEQSTART.1")]), ]
  
  ##### Replace the annotation column names with the headers to be displayed
  names(fusion_annot_top.clean) <- unname(fusion_annot_top.headers)

  fusions.genomicView <- DT::datatable( data = fusion_annot_top.clean, filter="none", rownames = FALSE, extensions = c('Buttons','Scroller'), options = list(pageLength = 10, dom = 'Bfrtip', buttons = c('excel', 'csv', 'pdf','copy','colvis'), scrollX = TRUE, deferRender = TRUE, scrollY = "167px", scroller = TRUE), width = 800, height = 318,  escape = FALSE) %>%
      DT::formatStyle( columns = names(fusion_annot_top.clean), `font-size` = '12px', 'text-align' = 'center' ) %>%
    
      ##### Highlight cells flagging cancer genes (grey), DNA support (coral) and reported fusions (green)
      DT::formatStyle( columns = colnames(fusion_annot_top.clean) %in% "Cancer gene(s)", backgroundColor = DT::styleEqual(c("-", "Yes"), c('transparent', 'lightgrey')) ) %>%
      DT::formatStyle( columns = colnames(fusion_annot_top.clean) %in% "DNA support (A)", backgroundColor = DT::styleEqual(c("-", "Yes"), c('transparent', 'coral')) ) %>%
      DT::formatStyle( columns = colnames(fusion_annot_top.clean) %in% "DNA support (B)", backgroundColor = DT::styleEqual(c("-", "Yes"), c('transparent', 'coral')) ) %>%
      DT::formatStyle( columns = colnames(fusion_annot_top.clean) %in% "Reported fusion", backgroundColor = DT::styleEqual( c("-", "Yes"), c('transparent', 'lightgreen')) )

  ##### Keep the widget as the last expression of this block so that it gets printed in the report
  fusions.genomicView
}

##### Clean the space. The temporary objects exist only if there were fusions to present
if ( nrow(fusion_annot_top) > 0 ) {
  rm(fusion_annot_top.clean, fusion_annot_top.headers)
}
```

<details>
<summary>Table legend</summary>
<font size="2">

Cells in <span style="color:#ff0000">RED</span> indicate **DNA-supported** fusion genes (see [Structural variants] section), cells in <span style="color:#02d653">GREEN</span> indicate gene fusions **reported in [FusionGDB](https://ccsm.uth.edu/FusionGDB){target="_blank"}**, and cells highlighted in <span style="color:#767689">GREY</span> indicate fusions containing **[Cancer genes]**. Genes known to be involved in gene fusions are flagged based on information provided in [FusionGDB](https://ccsm.uth.edu/FusionGDB){target="_blank"} and [Cancer Genome Interpreter](https://www.cancergenomeinterpreter.org/biomarkers){target="_blank"} (CGI) databases. Fusion events are ordered by **genomic coordinates** of **Gene A** and then **Gene B**. *DNA support (gene A/B)* - DNA-supported fusion gene(s) (see [Structural variants] section); *Reported fusion* - fusion event reported in [FusionGDB](https://ccsm.uth.edu/FusionGDB){target="_blank"}; *Cancer gene(s)* - gene fusion events involving [Cancer genes].

</font>
</details>

```{r genomic_view_table_fusions_save, comment = NA, message=FALSE, warning=FALSE, eval = runFusionChunk}
##### Save the table as html file
if ( nrow(fusion_annot_top) > 0 && params$save_tables ) {
  saveWidgetFix(widget=fusions.genomicView, file=paste(fusionsTableDir, "fusions.genomicView.html", sep = "/"), selfcontained=TRUE)  
}

##### Clean the space. The widget is created only when there are fusions to present, so remove it under the same condition to avoid calling rm() on an object that does not exist
if ( nrow(fusion_annot_top) > 0 ) {
  rm(fusions.genomicView)
}
```

***

### - Top hits {.tabset}

Expression profiles for gene fusion events involving **DNA-supported fusion** genes (see [Structural variants] section), gene fusions **reported in [FusionGDB](https://ccsm.uth.edu/FusionGDB){target="_blank"}** or **[Cancer genes]**, indicated in <span style="color:#ff0000">red</span>, <span style="color:#02d653">green</span> and <span style="color:#767689">grey</span> columns in the [Fusion genes] table, respectively, and with the highest *Split count* and *Pair count* values. 

<span style="color:#ff0000">NOTE</span>: the *visualisation* is available only for fusion genes detected by [Arriba](https://arriba.readthedocs.io/en/latest/){target="_blank"} (see the [- Summary] table).

```{r top_hits_fusions, echo=FALSE, comment = NA, message=FALSE, warning=FALSE}
suppressMessages(library(plotly))
##### Provide detailed expression info for the top ranked fusion events. The plots and tables are only prepared here and stored in lists (one element per fusion event); they are displayed in the following chunk
##### NOTE(review): unlike the display chunk, this chunk has no "eval = runFusionChunk" option - confirm that "fusions" always exists at this point
targets <- ref_dataset.list[[dataset]][["sample_annot"]]
data <- ref_dataset.list[[dataset]][["data_to_report"]]
genes <- as.character(fusions$geneA)

##### Create lists to store the plots (per fusion partner gene A/B) and the expression summary tables
output_cdf_A <- list()
output_cdf_B <- list()
output_counts_A <- list()
output_counts_B <- list()
output_density_A <- list()
output_density_B <- list()
output_table_Z <- list()
output_table_perc <- list()

##### Deal with no genes or when more than params$top_genes fusions are of interest
if ( length(genes) == 0 ) {
  genes <- NULL
  genes_no <- 0
} else if ( length(genes) > params$top_genes ) {
  genes_no <- params$top_genes
} else {
  genes_no <- length(genes)
}

##### Add genes B to the fusions gene list
genes <- c(genes, as.character(fusions$geneB))

##### Collect info and plots for each of the top fusions. seq_len() yields an empty sequence when there are no fusions, so no extra check is needed inside the loop
for ( i in seq_len(genes_no) ) {
  geneA <- as.vector(fusions$geneA[i])
  geneB <- as.vector(fusions$geneB[i])
  
  ##### For each gene generate (1) CDF plot, (2) bar-plot of read count data across all samples and (3) density plot to demonstrate expression distribution in investigated sample 
  if ( geneA %in% rownames(data) ) {
    
    ##### CDF plot
    output_cdf_A[[i]] <- cdfPlot(gene = geneA, data = data, targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group, addBoxPlot = FALSE, scaling = scaling, report_dir = results_dir)
    
    ##### Bar-plot of read counts
    ##### First map the gene symbol to Ensembl ID (used in the counts data). %in% is used for matching so that genes with missing symbols do not introduce NA IDs
    genes.ENSEMBL <- ref_dataset.list[[dataset]][["gene_annot_all"]]$ENSEMBL[ ref_dataset.list[[dataset]][["gene_annot_all"]]$SYMBOL %in% geneA ]
    
    output_counts_A[[i]] <- barPlot(gene = genes.ENSEMBL, data = ref_dataset.list[[dataset]][["combined_data"]], y_title = "Counts", targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group )
    
    ##### Density plot - expression distribution
    output_density_A[[i]] <- densityPlot(gene = geneA, data = data, main_title= "", x_title = "Z-score", sampleName = sample_name, distributions = c("normal", "bimodal"), scaling = scaling)
  }
  
  ##### Gene B (skipped if it is the same as gene A, since its plots have already been generated)
  if ( geneB %in% rownames(data) && geneB != geneA) {
    
    ##### CDF plot
    output_cdf_B[[i]] <- cdfPlot(gene = geneB, data = data, targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group, addBoxPlot = FALSE, scaling = scaling, report_dir = results_dir)
    
    ##### Bar-plot of read counts
    ##### First map the gene symbol to Ensembl ID (used in the counts data)
    genes.ENSEMBL <- ref_dataset.list[[dataset]][["gene_annot_all"]]$ENSEMBL[ ref_dataset.list[[dataset]][["gene_annot_all"]]$SYMBOL %in% geneB ]
    
    output_counts_B[[i]] <- barPlot(gene = genes.ENSEMBL, data = ref_dataset.list[[dataset]][["combined_data"]], y_title = "Counts", targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group )
    
    ##### Density plot - expression distribution
    output_density_B[[i]] <- densityPlot(gene = geneB, data = data, main_title= "", x_title = "Z-score", sampleName = sample_name, distributions = c("normal", "bimodal"), scaling = scaling) 
  }
    
  ##### Generate expression summary tables for the current fusion gene pair
  genes <- c(geneA, geneB)
    
  ##### Z-scores
  ##### The known fusion genes combine both partners of the known translocations. They must be concatenated before calling unique(), whose second positional argument is "incomparables" and not another set of values
  output_table_Z[[i]] <- exprTable( genes = unique(genes), data = data, targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group, genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")], oncokb_annot = ref_genes.list[["genes_oncokb"]], cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Germline") ], fusion_genes = unique(c(as.character(known_translocations$geneA), as.character(known_translocations$geneB))), ext_links = TRUE, type = "z", scaling = scaling)[[1]]
    
  ##### Percentiles
  output_table_perc[[i]] <- exprTable( genes = unique(genes), data = data, targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group, genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")], oncokb_annot = ref_genes.list[["genes_oncokb"]], cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Germline") ], fusion_genes = unique(c(as.character(known_translocations$geneA), as.character(known_translocations$geneB))), ext_links = TRUE, type = "perc", scaling = scaling)[[1]]
}

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload=FALSE)

#### Clear plots to free up some memory
if(!is.null(dev.list())) invisible(dev.off())
```

```{r top_hits_fusions_display, echo=FALSE, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 2.5, eval = runFusionChunk, results="asis"}
##### Now once the plots are ready show them in separate tabs (one tab per fusion event)

##### Helper printing the expression plots prepared for a single fusion gene: the CDF plot, the read counts bar-plot and the density plot, each followed by / wrapped in a collapsible section with its legend
##### gene = gene symbol used in the legends; cdf, counts, density = widgets generated for that gene in the previous chunk
printFusionGenePlots <- function(gene, cdf, counts, density) {
  cat(renderTags(cdf)$html)
  cat("\n<details>\n")
  cat("\n<summary>Plot legend</summary>\n")
  cat("<font size=\"2\">\n")
  cat(paste0("Distribution of percentile values (*y-axis*) as a function of expression levels (Z-scores, *x-axis*) of *", gene, "* in patient's sample (*black dot*) and other reference cancer cohort(s) (median value(s)).\n"))
  cat("\n</font>\n")
  cat("\n</details>\n")
  cat("\n<details>\n")
  cat("\n<summary>Read counts</summary>\n")
  cat(renderTags(counts)$html)
  cat("<font size=\"2\">\n")
  cat(paste0("Bar-plot illustrating read counts for *", gene, "* across all samples. The *", gene, "* read count in patient's sample is indicated by *black bar*.\n"))
  cat("\n</font>\n")
  cat("\n</details>\n")
  cat("\n<details>\n")
  cat("\n<summary>Expression distribution patterns</summary>\n")
  cat(renderTags(density)$html)
  cat("<font size=\"2\">\n")
  cat(paste0("Plot illustrating distribution of expression levels (Z-scores) of *", gene, "* *observed* across all samples along with simulated *normal* and *bimodal* distributions. The *", gene, "* expression level observed in patient's sample is indicated by *black dot* in each distribution.\n"))
  cat("\n</font>\n")
  cat("\n</details>\n")
}

if ( genes_no > 0 ) {
  
  ##### plotly is required to render the widgets. Attach it once rather than in every iteration
  suppressMessages(library(plotly))
  
  for ( i in seq_len(genes_no) ) {
    geneA <- as.vector(fusions$geneA[i])
    geneB <- as.vector(fusions$geneB[i])
    
    breakpointA <- as.vector(fusions$breakpointA[i])
    breakpointB <- as.vector(fusions$breakpointB[i])
    
    cat("\n#### ", paste(fusions$geneA[i], fusions$geneB[i], sep="-"), "\n")
    
    ##### Check if Arriba fusion plot exists. Skip this section if it doesn't. Colons (present in the breakpoint coordinates) are replaced with dots in the file path
    arriba_plot <- gsub(":", ".", paste0(results_dir, "/arriba/", make.names(paste(geneA, geneB, sep = "__")), "_", breakpointA, "-", breakpointB, ".png"))
    
    if ( file.exists(arriba_plot) ) {
      cat("\n##### Fusion genes visualisation\n")
  
      ##### Present Arriba plot in the report
      cat(paste0("![](", arriba_plot, ")"), "\n")
      cat("\n***\n")
    }
    
    cat("\n##### Fusion genes expression\n")
    cat("\nmRNA expression levels of fusion genes detected in patient's sample and their average mRNA expression (Z-score) in samples from cancer cohorts.\n")
      
    ##### Display plots for gene A
    if ( geneA %in% rownames(data) ) {
      printFusionGenePlots(gene = geneA, cdf = output_cdf_A[[i]], counts = output_counts_A[[i]], density = output_density_A[[i]])
      cat("<br />")
    } else {
      cat(paste0("\n<span style=\"color:#ff0000\">NOTE</span>, expression data is not available for ", geneA, ".\n"))
    }
    
    ##### Display plots for gene B, unless it is the same as gene A
    if ( geneB == geneA ) {
      cat(paste0("\n"))
    } else if ( geneB %in% rownames(data)  ) {
      printFusionGenePlots(gene = geneB, cdf = output_cdf_B[[i]], counts = output_counts_B[[i]], density = output_density_B[[i]])
      cat("\n***\n")
    } else {
      cat(paste0("\n<span style=\"color:#ff0000\">NOTE</span>, expression data is not available for ", geneB, ".\n"))
    }
      
    cat("\n##### Summary table {.tabset}\n")
    cat("\n###### Percentiles\n")
    cat(renderTags(output_table_perc[[i]])$html)
    cat("<br />")
    cat("\n<details>\n")
    cat("\n<summary>Table legend</summary>\n")
    cat("\n<font size=\"2\">\n")
    ##### The legend is assembled with paste0 (as for the Z-scores legend below), because passing the pieces to cat() separately puts spaces around the cohort name, including before the closing "**" of the bold text
    cat(paste0("\nThe <span style=\"color:#ff0000\">RED</span> colour range indicate relatively **high expression** (percentile) values and <span style=\"color:#0000ff\">BLUE</span> colour range indicate relatively **low expression** (percentile) values in individual sample group. The **Diff** (**Patient vs ", comp_cancer_group, "**) column illustrates the difference between percentiles in patient sample and reference cancer cohort for each fusion gene. Genes considered to be oncogenes or tumour suppressor genes, according to [OncoKB](http://oncokb.org/#/cancerGenes){target=\"_blank\"} database, are also indicated. Genes are ordered by **decreasing** absolute values in the **Diff** (**Patient vs ", comp_cancer_group, "**) column. *TSG* - tumour suppressor gene\n"))
    cat("\n</font>\n")
    cat("\n</details>\n")
    
    if ( length(genes) > 2000 ) { 
        cat(paste0("<span style=\"color:#ff0000\">NOTE</span>, the table was truncated to 2000 entries.")) 
    }
    
    cat("\n***\n")
    
    cat("\n###### Z-scores\n")
    cat(renderTags(output_table_Z[[i]])$html)
    cat("<br />")
    cat("\n<details>\n")
    cat("\n<summary>Table legend</summary>\n")
    cat("\n<font size=\"2\">\n")
    cat(paste0("\nThe <span style=\"color:#ff0000\">RED</span> colour range indicate relatively **high expression** (Z-score) values and <span style=\"color:#0000ff\">BLUE</span> colour range indicate relatively **low expression** (Z-score) values in individual sample group. The **Diff** (**Patient vs ",  comp_cancer_group, "**) column illustrates the difference between Z-scores in patient sample and reference cancer cohort for each fusion gene. Genes considered to be oncogenes or tumour suppressor genes, according to [OncoKB](http://oncokb.org/#/cancerGenes){target=\"_blank\"} database, are also indicated. Genes are ordered by **decreasing** absolute values in the **Diff** (**Patient vs ", comp_cancer_group, "**) column. *TSG* - tumour suppressor gene\n"))
    cat("\n</font>\n")
    cat("\n</details>\n")
    
    if ( length(genes) > 2000 ) { 
        cat(paste0("<span style=\"color:#ff0000\">NOTE</span>, the table was truncated to 2000 entries.")) 
    }
    
    cat("\n***\n")
  }
  #### Clear plots to free up some memory
  if(!is.null(dev.list())) invisible(dev.off())
  
} else {
  cat("\nNo alterations were reported.\n")
   cat("\n***\n")
}

##### Detach plotly package. Otherwise it clashes with other graphics devices. plotly is attached in this chunk only when there are fusions to display, so check that it is on the search path first (detach() fails for packages that are not attached)
if ( "package:plotly" %in% search() ) {
  detach("package:plotly", unload=FALSE)
}

##### Clean the space: the lists with prepared plots and tables ("output..." objects) and the temporary objects created in this chunk
rm(list = ls(pattern='^output'))
rm(list = intersect(ls(), c("printFusionGenePlots", "arriba_plot")))
```

***

## Structural variants

mRNA expression levels of genes located within detected structural variants (SVs), obtained from [Manta](https://github.com/Illumina/manta){target="_blank"} SV caller, in patient's sample and their average mRNA expression in samples from cancer cohorts.

`r if ( !runSVsChunk ) { c("SVs information for this sample is **NOT AVAILABLE**") }` 

### - Summary table {.tabset}

Out of the `r if ( runSVsChunk ) { length(unique(manta_sv$Gene)) } ` genes affected by `r if ( runSVsChunk ) { nrow(manta_sv) }` SVs, the expression of **`r if ( runSVsChunk ) { length(which(unique(manta_sv$Gene) %in% rownames(ref_dataset.list[[dataset]][["data_to_report"]]))) } else { length(NULL) }`** was reliably measured in patient's sample. The remaining `r if ( runSVsChunk ) { length(which(unique(manta_sv$Gene) %!in% rownames(ref_dataset.list[[dataset]][["data_to_report"]]))) }` genes are either not expressed or their expression level is too low to be detected (indicated in <span style="color:#808080">BLANK</span> cells with missing values).


#### Percentiles

```{r sv_genes_table_perc, comment = NA, message=FALSE, warning=FALSE, eval = runSVsChunk}
##### Register this section in the MySQL command used to populate the RNA-seq data portal
mysql_populate <- paste0(mysql_populate, ",Structural variants")
mysql_populate_update <- paste0(mysql_populate_update, ",Structural variants")

##### Samples annotation and the fully processed expression data used for the SV-affected genes tables and plots
targets <- ref_dataset.list[[dataset]][["sample_annot"]]
data <- ref_dataset.list[[dataset]][["data_to_report"]]

##### Keep only genes reported in MANTA output whose symbol is present in the gene annotation
genes <- unique(manta_sv$Gene)
genes <- genes[ genes %in% ref_dataset.list[[dataset]][["gene_annot_all"]]$SYMBOL ]

##### Work out how many genes to profile: none, all of them, or at most "params$top_genes"
##### ("limit_genes" flags that the list had to be capped)
limit_genes <- FALSE

if ( length(genes) == 0 ) {
  genes <- NULL
  genes_no <- 0
} else if ( length(genes) > params$top_genes ) {
  limit_genes <- TRUE
  genes_no <- params$top_genes
} else {
  genes_no <- length(genes)
}

##### Build the percentile-based expression summary table for SV-affected genes
sv_genes.expr.perc <- exprTable(
  genes = genes,
  data = data,
  sv_data = manta_sv,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline") ],
  ext_links = TRUE,
  type = "perc",
  scaling = scaling
)

##### Show the table widget (first element of the "exprTable" output)
sv_genes.expr.perc[[1]]

##### Optionally write the table widget to a standalone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = sv_genes.expr.perc[[1]],
    file = paste(exprTableDir, "sv_genes.expr.perc.html", sep = "/"),
    selfcontained = TRUE
  )
}
```

<details>
<summary>Table legend</summary>
<font size="2">

The <span style="color:#ff0000">RED</span> colour range indicate relatively **high expression** (percentile) values and <span style="color:#0000ff">BLUE</span> colour range indicate relatively **low expression** (percentile) values in individual sample group. The **Diff** (**Patient vs `r comp_cancer_group`**) column illustrates the difference between percentiles in patient sample and reference cancer cohort for each gene. Genes considered to be oncogenes or tumour suppressor genes, according to [OncoKB](http://oncokb.org/#/cancerGenes){target="_blank"} database, are also indicated. Genes are ordered by **increasing SV score** and then by **decreasing** absolute values in the **Diff** (**Patient vs `r comp_cancer_group`**) columns. *TSG* - tumour suppressor gene

**Tier**: SV priority score based on AstraZeneca [simple_sv_annotation.py](https://github.com/AstraZeneca-NGS/simple_sv_annotation/blob/master/simple_sv_annotation.py#L21-L36){target="_blank"} script; **1 = high** and **4 = low priority**

</font>
</details>

`r if ( runSVsChunk && length(genes) > 2000 ) { c(paste0("<span style=\"color:#ff0000\">NOTE</span>, the table was truncated to 2000 entries.")) } else { cat("") }`

***

#### Z-scores

```{r sv_genes_table, comment = NA, message=FALSE, warning=FALSE, eval = runSVsChunk}
##### Build the Z-score-based expression summary table for SV-affected genes
##### (only the table widget, i.e. the first element of the "exprTable" output, is kept)
sv_genes.expr.z <- exprTable(
  genes = genes,
  data = data,
  sv_data = manta_sv,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
  cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline") ],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  ext_links = TRUE,
  type = "z",
  scaling = scaling
)[[1]]

##### Show the table widget
sv_genes.expr.z

##### Optionally write the table widget to a standalone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = sv_genes.expr.z,
    file = paste(exprTableDir, "sv_genes.expr.z.html", sep = "/"),
    selfcontained = TRUE
  )
}

##### Drop the widget once it has been rendered / saved
rm(sv_genes.expr.z)
```

<details>
<summary>Table legend</summary>
<font size="2">

The <span style="color:#ff0000">RED</span> colour range indicate relatively **high expression** (Z-score) values and <span style="color:#0000ff">BLUE</span> colour range indicate relatively **low expression** (Z-score) values in individual sample group. The **Diff** (**Patient vs `r comp_cancer_group`**) column illustrates the difference between Z-scores in patient sample and reference cancer cohort for each gene. Genes considered to be oncogenes or tumour suppressor genes, according to [OncoKB](http://oncokb.org/#/cancerGenes){target="_blank"} database, are also indicated. Genes are ordered by **increasing SV score** and then by **decreasing** absolute values in the **Diff** (**Patient vs `r comp_cancer_group`**) column. *TSG* - tumour suppressor gene

**Tier**: SV priority score based on AstraZeneca [simple_sv_annotation.py](https://github.com/AstraZeneca-NGS/simple_sv_annotation/blob/master/simple_sv_annotation.py#L21-L36){target="_blank"} script; **1 = high** and **4 = low priority**

</font>
</details>

`r if ( runSVsChunk && length(genes) > 2000 ) { c(paste0("<span style=\"color:#ff0000\">NOTE</span>, the table was truncated to 2000 entries.")) } else { cat("") }`

***

### - Expression profiles {.tabset}

`r if ( exists("limit_genes") ) { if ( limit_genes ) { c(paste0("Expression profiles for ", genes_no, " SVs-affected genes with the highest priority (low [tier](https://github.com/AstraZeneca-NGS/simple_sv_annotation/blob/master/simple_sv_annotation.py#L21-L36){target=\"_blank\"}) and demonstrating the greatest difference in mRNA expression (percentile) values between patient's sample and the average mRNA expression in samples from cancer patients.")) } else { cat(" ") }}`

```{r cdf_plots_sv, echo=FALSE, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, eval = runSVsChunk, results="asis"}
suppressMessages(library(plotly))
##### Generate empirical cumulative distribution function (ECDF) plot illustrating mRNA expression level for the genes of interest in the context of the overall mRNA expression distribution
output_cdf <- list()
output_counts <- list()
output_density <- list()

##### Genes are taken from the second element of the percentile table output, so their number may differ from "genes_no" computed in the "sv_genes_table_perc" chunk
genes <- unique(sv_genes.expr.perc[[2]]$SYMBOL)

##### Never iterate past the genes actually available (otherwise genes[i] is NA and an "NA" tab gets rendered)
sv_genes_no <- min(as.integer(genes_no), length(genes))

##### For each gene generate (1) CDF plot and add boxplot below to show the data variance for selected gene in individual groups, (2) bar-plot of read count data across all samples and (3) density plot to demonstrate expression distribution in investigated sample
##### NOTE: seq_len() is used rather than 1:n, since the latter yields c(1, 0) when there are no genes
for ( i in seq_len(sv_genes_no) ) {
  if ( genes[i] %in% rownames(data) ) {
    
    ##### CDF plot
    output_cdf[[i]] <- cdfPlot(gene = genes[i], data = data, targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group, addBoxPlot = TRUE, scaling = scaling, report_dir = results_dir)
    
    ##### Bar-plot of read counts
    ##### First map the gene symbol to Ensembl ID (used in the counts data)
    genes.ENSEMBL <- ref_dataset.list[[dataset]][["gene_annot_all"]]$ENSEMBL[ ref_dataset.list[[dataset]][["gene_annot_all"]]$SYMBOL ==  genes[i] ]
    
    output_counts[[i]] <- barPlot(gene = genes.ENSEMBL, data = ref_dataset.list[[dataset]][["combined_data"]], y_title = "Counts", targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group )
    
    ##### Density plot - expression distribution
    output_density[[i]] <- densityPlot(gene = genes[i], data = data, main_title= "", x_title = "Z-score", sampleName = sample_name, distributions = c("normal", "bimodal"), scaling = scaling) 
  }
}

##### Now once the plots are ready show them in separate tabs
if ( sv_genes_no != 0 ) {
  for ( i in seq_len(sv_genes_no) ) {
    if ( genes[i] %in% rownames(data) ) {
      cat("\n#### ", genes[i], "\n")
      cat(renderTags(output_cdf[[i]])$html)
      cat("\n<details>\n")
      cat("\n<summary>Plot legend</summary>\n")
      cat("<font size=\"2\">\n")
      cat(paste0("**Top panel**: distribution of percentile values (*y-axis*) as a function of expression levels (Z-scores, *x-axis*) for *", genes[i], "* in patient's sample (*black dot*) and other reference cancer cohort(s) (median value(s)).\n\n"))
      cat(paste0("**Bottom panel**: box-plot presenting expression level (Z-score) of *", genes[i], "* in patient's sample (*black dot*) and its expression levels observed across samples from other reference cancer cohort(s).\n"))
      cat("\n</font>\n")
      cat("\n</details>\n")
      cat("\n<details>\n")
      cat("\n<summary>Read counts</summary>\n")
      cat(renderTags(output_counts[[i]])$html)
      cat("<font size=\"2\">\n")
      cat(paste0("Bar-plot illustrating read counts for *", genes[i], "* across all samples. The *", genes[i], "* read count in patient's sample is indicated by *black bar*.\n"))
      cat("\n</font>\n")
      cat("\n</details>\n")
      cat("\n<details>\n")
      cat("\n<summary>Expression distribution patterns</summary>\n")
      cat(renderTags(output_density[[i]])$html)
      cat("<font size=\"2\">\n")
      cat(paste0("Plot illustrating distribution of expression levels (Z-scores) of *", genes[i], "* *observed* across all samples along with simulated *normal* and *bimodal* distributions. The *", genes[i], "* expression level observed in patient's sample is indicated by *black dot* in each distribution.\n"))
      cat("\n</font>\n")
      cat("\n</details>\n")
      cat("\n***\n")
    } else {
      ##### Gene is listed in the table but has no row in the expression data
      cat("\n#### ", genes[i], "\n")
      cat("\n<span style=\"color:#ff0000\">NOTE</span>, expression data is not available for that gene.\n")
      cat("\n***\n")
    }
  }
  #### Clear plots to free up some memory
  if(!is.null(dev.list())) invisible(dev.off())
  
} else {
  cat("\nNo alterations were reported.\n")
  cat("\n***\n")
}

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload=FALSE)

##### Clean the space (all objects whose name starts with "output"; "limit_genes" is re-defined in the CN section)
rm(list = ls(pattern='^output'))
rm(limit_genes, sv_genes_no)
```

`r if ( !runSVsChunk ) { c("***") } else { c(" ") }`

## CN altered genes

Section overlaying the mRNA expression data with per-gene somatic copy-number (CN) data (from [PURPLE](https://github.com/hartwigmedical/hmftools/tree/master/purity-ploidy-estimator){target="_blank"}), as well as SNVs/indels and SVs data, if available.

`r if ( !runPurpleChunk ) { c("CN information for this sample is **NOT AVAILABLE**.") }`

### - Genomic view

`r if (runPurpleChunk) { length(ref_dataset.list[[dataset]][["expr_mut_cn_data_all"]][,1])} else { cat("0") }` genes with available CN data (*y-axis*) are presented in the genomic context (*x-axis*). **`r if (runPurpleChunk) { length(ref_dataset.list[[dataset]][["expr_mut_cn_data"]][,1]) } else { length(NULL) }`** of them (indicated by *various colours*) are [Cancer genes] and are gained `r if ( runPurpleChunk ) { paste0("(CN values >= ", cn_top, ")") }` or lost `r if ( runPurpleChunk ) { paste0("(CN values =< ", cn_bottom, ")") }`. All other genes are marked in *<span style="color:#808080">gray</span>* or *black*.

```{r cn_genomic_view, comment = NA, message=FALSE, warning=FALSE, fig.width = 8.3, fig.height = 4, eval = runPurpleChunk}
##### Update MySQL command to populate RNA-seq data portal
mysql_populate <- paste0(mysql_populate, ",CN altered genes")
mysql_populate_update <- paste0(mysql_populate_update, ",CN altered genes")

##### Generate genomic view plot with per-gene CN values (y-axis) along chromosomal coordinates (x-axis)
suppressMessages(library(manhattanly))
suppressMessages(library(plotly))

##### "data" holds all genes with CN data, "data.sub" the subset of genes of interest highlighted in the plot
##### NOTE: "data.sub" is re-used by the "Expression vs CN" chunks further down
data <- ref_dataset.list[[dataset]][["expr_mut_cn_data_all"]]
data.sub <- ref_dataset.list[[dataset]][["expr_mut_cn_data"]]

##### Add SNVs
##### Make sure the alterations are stored as character strings, so that further labels can be appended below
if ( runPcgrChunk ) {
  data$Alterations <- as.character(data$Alterations)
  data.sub$Alterations <- as.character(data.sub$Alterations)
}

##### Add fusion genes
if ( runFusionChunk ) {
  
  ##### Append "Fusion" to the alteration type of genes involved in fusions. Both fusion partners are combined first, so that a gene reported as geneA in one event and geneB in another is labelled only once
  fusion_genes <- union(as.character(fusions$geneA), as.character(fusions$geneB))
  
  data$Alterations[ data$Gene %in% fusion_genes ] <- paste0( data$Alterations[ data$Gene %in% fusion_genes ], "; Fusion")
  data.sub$Alterations[ data.sub$Gene %in% fusion_genes ] <- paste0( data.sub$Alterations[ data.sub$Gene %in% fusion_genes ], "; Fusion")
  
  rm(fusion_genes)
}
            
##### Add genes involved in SVs (if data available)
if ( runSVsChunk ) {
  
  ##### Append "SV" to the alteration type of genes located within structural variants
  data$Alterations[ data$Gene %in% unique(manta_sv$Gene)  ] <- paste0( data$Alterations[ data$Gene %in% unique(manta_sv$Gene)  ], "; SV")
  data.sub$Alterations[ data.sub$Gene %in% unique(manta_sv$Gene)  ] <- paste0( data.sub$Alterations[ data.sub$Gene %in% unique(manta_sv$Gene)  ], "; SV")
}

##### Replace alteration status "None" with "CN" for genes which are not mutated but are involved in fusions or SVs
data$Alterations <- gsub( "None", "CN", data$Alterations)
data.sub$Alterations <- gsub( "None", "CN", data.sub$Alterations)

##### Prepare dataframe for manhattanly
##### Keep only genes for which gene symbol (and genomic location) is available
data <- data[ data$Gene %in% ref_dataset.list[[dataset]][["gene_annot"]]$SYMBOL, ]

##### manhattanly expects a "P" column for the y-axis values, so the CN column is renamed accordingly
names(data)[match("CN", names(data))] <- "P"

##### Merge genes genomic coordinates info with their annotation and expression data
data.annot <- merge(data, ref_dataset.list[[dataset]][["gene_annot"]], by.x = "Gene", by.y = "SYMBOL", all.x = FALSE)

##### Convert via character first, since as.numeric() applied directly to a factor returns the internal level codes rather than the values
##### Non-numeric sequence names become NA here and are dropped below
data.annot$SEQNAME <- as.numeric(as.character(data.annot$SEQNAME))
data.annot$GENESEQSTART <- as.numeric(as.character(data.annot$GENESEQSTART))
data.annot <- data.annot[ !is.na(data.annot$SEQNAME), ]

if ( nrow(data.annot) > 0 ) {
  
  ##### Get plot results first to extract x-axis coordinates to annotate genes of interest
  manhattanr.res <- manhattanr(x = data.annot, chr = "SEQNAME", bp = "GENESEQSTART", p = "P", snp = "Gene", gene = "Z_score_diff", annotation1 = "Perc_diff", annotation2 = "Alterations", logp = FALSE)
  
  ##### Restrict the results to the genes of interest
  manhattanr.res$data <- manhattanr.res$data[ manhattanr.res$data$Gene %in% data.sub$Gene, ]
  
  ##### Draw all genes first, then overlay the genes of interest as coloured markers with gene name labels
  p <- manhattanly(x = data.annot, chr = "SEQNAME", bp = "GENESEQSTART", p = "P", snp = "Gene", gene = "Z_score_diff", annotation1 = "Perc_diff", annotation2 = "Alterations", suggestiveline = cn_top, genomewideline  = cn_bottom, suggestiveline_color = "gray", genomewideline_color = "gray", ylab = "CN value", showgrid = FALSE, title = "", logp = FALSE) %>%
    
    add_markers(y = manhattanr.res$data$P, x = manhattanr.res$data$pos, 
                name = manhattanr.res$data$Gene,
                text = paste0("Gene: ", manhattanr.res$data$Gene, "\nZ_score_diff: ", manhattanr.res$data$Z_score_diff, "\nPerc_diff: ", manhattanr.res$data$Perc_diff, "\nAlterations: ", manhattanr.res$data$Alterations, "\nchr: ", manhattanr.res$data$CHR),
                mode = 'markers',
                marker = list(size=10, symbol="circle"),
                color = manhattanr.res$data$Gene,
                showlegend = TRUE,
                legendtitle=TRUE, 
                inherit = FALSE) %>%
    
    add_annotations( data = manhattanr.res$data, text=~Gene,
                      x=~pos, xanchor="left",
                      y=~P, yanchor="top",
                      font = list(color = "Grey", size = 10),
                      legendtitle=TRUE,
                      showarrow=FALSE )
  
  ##### Create directory for the plots
  PlotDir <- paste(results_dir, "cn_genomic_view", sep = "/")
  if ( !file.exists(PlotDir) ) {
    dir.create(PlotDir, recursive=TRUE)
  }
  
  ##### Save interactive plot as html file
  saveWidgetFix(p, file = paste(PlotDir, "cn_genomic_view.html", sep = "/"))
  
  #### Clear plots to free up some memory
  if(!is.null(dev.list())) invisible(dev.off())
  
} else {
  cat("None of the genes of interest are affected by changes in CN.")
  p <- NULL
}

##### Present the plot (nothing is shown when p is NULL)
p

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:manhattanly", unload=FALSE)
detach("package:plotly", unload=FALSE)
```

***

<details>
<summary>CN data distribution</summary>

`r if ( runPurpleChunk ) { c(" ") } else { c("CN information for this sample is NOT AVAILABLE.") }`

```{r cn_data_distribution_plot, comment = NA, message=FALSE, warning=FALSE, fig.width = 12, fig.height = 4, eval = runPurpleChunk }
##### Present the histogram illustrating CN data distribution ("cn_dist_plot" is prepared outside of this chunk)
suppressMessages(library(plotly))
cn_dist_plot

##### Make sure the output directory for the plot exists
PlotDir <- paste(results_dir, "cn_dist_plot", sep = "/")

if ( !file.exists(PlotDir) ) {
  dir.create(PlotDir, recursive = TRUE)
}

##### Write the interactive plot to an html file, then detach plotly
saveWidgetFix(cn_dist_plot, file = paste(PlotDir, "cn_dist_plot.html", sep = "/"))
detach("package:plotly", unload = FALSE)

#### Close any open graphics device to free up some memory
if ( !is.null(dev.list()) ) {
  invisible(dev.off())
}
```

</details>

***

### - Expression vs CN {.tabset}

Scatterplot comparing the per-gene difference in **mRNA expression** of [Cancer genes] between patient's sample and cancer individuals (*y-axis*), and **CN values** (*x-axis*, from [PURPLE](https://github.com/hartwigmedical/hmftools/tree/master/purity-ploidy-estimator){target="_blank"}).

#### Percentiles

```{r cn_expr_data_plot_perc, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 4, eval = runPurpleChunk}
##### Scatterplot of per-gene expression (percentile) differences (y-axis) against CN values (x-axis); mutation status is shown (colours) when PCGR data is available
suppressMessages(library(plotly))

##### Genes of interest ("data.sub" is prepared in the "cn_genomic_view" chunk); "cn_genes" is re-used by the following chunks
cn_genes <- data.sub$Gene

if ( length(cn_genes) > 0 ) {
  
  if ( runPcgrChunk ) {
    ##### Mutation data available
    mutCNexprPlot(data = data.sub, alt_data = TRUE, cn_bottom = cn_bottom, cn_top = cn_top, comp_cancer = comp_cancer_group, type = "perc", report_dir = results_dir)
    
  } else {
    ##### No mutation data
    data.sub <- data.sub[ data.sub$Gene %in% cn_genes, ]
    mutCNexprPlot(data = data.sub, alt_data = FALSE, cn_bottom = cn_bottom, cn_top = cn_top, comp_cancer = comp_cancer_group, type = "perc", report_dir = results_dir)
  }
  
} else {
  cn_genes <- NULL
  cat("None of the genes of interest are affected by changes in CN.")
}

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload = FALSE)

#### Close any open graphics device to free up some memory
if ( !is.null(dev.list()) ) {
  invisible(dev.off())
}
```

***

#### Z-scores

```{r cn_expr_data_plot, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 4, eval = runPurpleChunk}
##### Scatterplot of per-gene expression (Z-score) differences (y-axis) against CN values (x-axis); mutation status is shown (colours) when PCGR data is available
suppressMessages(library(plotly))

if ( length(cn_genes) > 0 ) {
  
  ##### Restrict the data to the genes of interest defined in the "cn_expr_data_plot_perc" chunk
  data.sub <- data.sub[ data.sub$Gene %in% cn_genes, ]
  
  if ( runPcgrChunk ) {
    ##### Mutation data available
    mutCNexprPlot(data = data.sub, alt_data = TRUE, cn_bottom = cn_bottom, cn_top = cn_top, comp_cancer = comp_cancer_group, type = "z", report_dir = results_dir)
    
  } else {
    ##### No mutation data
    mutCNexprPlot(data = data.sub, alt_data = FALSE, cn_bottom = cn_bottom, cn_top = cn_top, comp_cancer = comp_cancer_group, type = "z", report_dir = results_dir)
  }
  
} else {
  cn_genes <- NULL
  cat("None of the genes of interest are affected by changes in CN.")
}

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload = FALSE)

#### Close any open graphics device to free up some memory
if ( !is.null(dev.list()) ) {
  invisible(dev.off())
}
```

***

### - Summary table {.tabset}

Out of the `r if (runPurpleChunk) { length(ref_dataset.list[[dataset]][["expr_mut_cn_data_all"]][ ref_dataset.list[[dataset]][["expr_mut_cn_data_all"]]$CN <= cn_bottom | ref_dataset.list[[dataset]][["expr_mut_cn_data_all"]]$CN >= cn_top, ][,1]) } else { cat("0") }` genes within gained `r if ( runPurpleChunk ) { paste0("(CN values >= ", cn_top, ")") }` or lost `r if ( runPurpleChunk ) { paste0("(CN values =< ", cn_bottom, ")") }` regions `r if (runPurpleChunk) { length(ref_dataset.list[[dataset]][["expr_mut_cn_data"]][,1]) } else { length(NULL) }` are [Cancer genes]. The expression of **`r if (runPurpleChunk) { length(which(ref_dataset.list[[dataset]][["expr_mut_cn_data"]]$Gene %in% rownames(ref_dataset.list[[dataset]][["data_to_report"]]))) } else { length(NULL) }`** of these genes was reliably measured in patient's sample. The remaining `r if (runPurpleChunk) { length(which(cn_genes %!in% rownames(ref_dataset.list[[dataset]][["data_to_report"]]))) } else { length(NULL) }` genes are either not expressed or their expression level is too low to be detected (indicated in <span style="color:#808080">BLANK</span> cells with missing values).

#### Gains {.tabset}

Table summarising the **mRNA expression** values in cancer and patient samples for genes with **CN** values >= `r if ( runPurpleChunk ) { cn_top } else { c("(NA)") }` (**gains**), based on patient's genomic data (from [PURPLE](https://github.com/hartwigmedical/hmftools/tree/master/purity-ploidy-estimator){target="_blank"}), and mutation status if available (from [PCGR](https://github.com/sigven/pcgr){target="_blank"}).

##### Percentiles

```{r cn_expr_data_table_gains_perc, comment = NA, message=FALSE, warning=FALSE, eval = runPurpleChunk}
##### Percentile-based expression summary table for genes within CN gains, with mutation status info when PCGR data is available
##### Keep only the CN column for genes with CN values at or above the "cn_top" threshold
cn_data <- ref_dataset.list[[dataset]][["expr_mut_cn_data"]]
cn_data <- cn_data[ cn_data$CN >= cn_top, ]
cn_data <- cn_data[, "CN", drop = FALSE]

##### Genes of interest found among the rownames of the gained-genes CN data
genes_gains <- as.character(cn_genes[ cn_genes %in% rownames(cn_data) ])

##### Number of gained genes to consider: none, all of them, or at most "params$top_genes"
if ( length(genes_gains) == 0 ) {
  genes_gains <- NULL
  genes_gains_no <- 0
} else if ( length(genes_gains) > params$top_genes ) {
  genes_gains_no <- params$top_genes
} else {
  genes_gains_no <- length(genes_gains)
}

##### Get expression data
data <- ref_dataset.list[[dataset]][["data_to_report"]]

if ( runPcgrChunk && runPurpleChunk ) {
  
  ##### Expression, CN and mutation data
  cn_expr_genes.expr.gains.perc <- exprTable(
    genes = genes_gains,
    data = data,
    cn_data = cn_data,
    cn_decrease = TRUE,
    targets = targets,
    sampleName = sample_name,
    ext_cancer = ext_cancer_group,
    int_cancer = int_cancer_group,
    comp_cancer = comp_cancer_group,
    add_cancer = add_cancer_group,
    genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
    mut_annot = ref_genes.list[["pcgr"]][, c("SYMBOL", "TIER", "CONSEQUENCE", "VARIANT_CLASS", "AF_TUMOR", "GENOMIC_CHANGE", "PROTEIN_CHANGE")],
    oncokb_annot = ref_genes.list[["genes_oncokb"]],
    cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline") ],
    ext_links = TRUE,
    type = "perc",
    scaling = scaling
  )
  
} else if ( runPurpleChunk ) {
  
  ##### Expression and CN data only
  cn_expr_genes.expr.gains.perc <- exprTable(
    genes = genes_gains,
    data = data,
    cn_data = cn_data,
    cn_decrease = TRUE,
    targets = targets,
    sampleName = sample_name,
    ext_cancer = ext_cancer_group,
    int_cancer = int_cancer_group,
    comp_cancer = comp_cancer_group,
    add_cancer = add_cancer_group,
    genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
    oncokb_annot = ref_genes.list[["genes_oncokb"]],
    cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline") ],
    ext_links = TRUE,
    type = "perc",
    scaling = scaling
  )
}

##### Show the table widget (first element of the "exprTable" output)
cn_expr_genes.expr.gains.perc[[1]]

##### Optionally write the table widget to a standalone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = cn_expr_genes.expr.gains.perc[[1]],
    file = paste(exprTableDir, "cn_expr_genes.expr.gains.perc.html", sep = "/"),
    selfcontained = TRUE
  )
}
```

<details>
<summary>Table legend</summary>
<font size="2">

The <span style="color:#ff0000">RED</span> colour range indicates relatively **high expression** (percentile) values and <span style="color:#0000ff">BLUE</span> colour range indicates relatively **low expression** (percentile) values in individual sample group. The <span style="color:#808080">BLANK</span> cells with missing values indicate genes with **no/low expression**. The **Diff** (**Patient vs `r comp_cancer_group`**) column illustrates the difference between percentiles in patient sample and reference cancer cohort for each gene. The **CN values** based on patient's genomic data are presented in *Patient (CN)* column with a horizontal blue bar indicating the CN value of each gene in the context of other genes. If mutation data is available, then the variants’ tier, consequence, class and tumour allele frequency (AF), as well as genomic and protein change are also provided on right-hand side based on information from [PCGR](https://github.com/sigven/pcgr){target="_blank"} report (similar to [Mutated genes] section). Genes are ordered by **Patient (CN)** and then by **decreasing** absolute values in the **Diff** (**Patient vs `r comp_cancer_group`**) column. *CN* - copy-number

</font>
</details>

`r if ( runPurpleChunk && length(genes_gains) > 2000 ) { c(paste0("<span style=\"color:#ff0000\">NOTE</span>, the table was truncated to 2000 entries.")) } else { cat("") }`

***

##### Z-scores

```{r cn_expr_data_table_gains, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, eval = runPurpleChunk}
##### Z-score-based expression summary table for genes within CN gains
##### "genes_gains", "cn_data" and "data" are prepared in the "cn_expr_data_table_gains_perc" chunk

##### Generate expression summary table for per-gene expression values, CN values and mutation status info (colours)
if ( runPcgrChunk && runPurpleChunk ) {
  cn_expr_genes.expr.gains.z <- exprTable( genes = genes_gains, data = data, cn_data = cn_data, cn_decrease = TRUE, targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group, genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")], mut_annot = ref_genes.list[["pcgr"]][, c("SYMBOL", "TIER", "CONSEQUENCE", "VARIANT_CLASS", "AF_TUMOR", "GENOMIC_CHANGE", "PROTEIN_CHANGE")], oncokb_annot = ref_genes.list[["genes_oncokb"]], cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline") ], ext_links = TRUE, type = "z", scaling = scaling)
  
##### Generate expression summary table for per-gene expression values and CN values
##### The OncoKB annotation is passed here as well, in line with the percentile table and the other expression tables in this section
} else if ( runPurpleChunk ) {
  cn_expr_genes.expr.gains.z <- exprTable( genes = genes_gains, data = data, cn_data = cn_data, cn_decrease = TRUE, targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group, genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")], oncokb_annot = ref_genes.list[["genes_oncokb"]], cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline") ], ext_links = TRUE, type = "z", scaling = scaling)
}
  
##### Present the expression, CN and mutation data summary table (first element of the "exprTable" output)
cn_expr_genes.expr.gains.z[[1]]

##### Save the expression table as html file
if ( params$save_tables ) {
  saveWidgetFix(widget=cn_expr_genes.expr.gains.z[[1]], file=paste(exprTableDir, "cn_expr_genes.expr.gains.z.html", sep = "/"), selfcontained=TRUE)
}

##### Clean the space ("cn_data" is re-created for CN losses in the next table chunk)
rm(cn_expr_genes.expr.gains.z, cn_data)
```

<details>
<summary>Table legend</summary>
<font size="2">

The <span style="color:#ff0000">RED</span> colour range indicates relatively **high expression** (Z-score) values and <span style="color:#0000ff">BLUE</span> colour range indicates relatively **low expression** (Z-score) values in individual sample group. The <span style="color:#808080">BLANK</span> cells with missing values indicate genes with **no/low expression**. The **Diff** (**Patient vs `r comp_cancer_group`**) column illustrates the difference between Z-scores in patient sample and reference cancer cohort for each gene. The **CN values** based on patient's genomic data are presented in *Patient (CN)* column with a horizontal blue bar indicating the CN value of each gene in the context of other genes. If mutation data is available, then the variants’ tier, consequence, class and tumour allele frequency (AF), as well as genomic and protein change are also provided on right-hand side based on information from [PCGR](https://github.com/sigven/pcgr){target="_blank"} report (similar to [Mutated genes] section). Genes are ordered by **Patient (CN)** and then by **decreasing** absolute values in the **Diff** (**Patient vs `r comp_cancer_group`**) column. *CN* - copy-number

</font>
</details>

`r if ( runPurpleChunk && length(genes_gains) > 2000 ) { c(paste0("<span style=\"color:#ff0000\">NOTE</span>, the table was truncated to 2000 entries.")) } else { cat("") }`

***

#### Losses {.tabset}

Table summarising the **mRNA expression** values in cancer and patient samples for genes with **CN** values =< `r if ( runPurpleChunk ) { cn_bottom } else { c("(NA)") }` (**losses**), based on patient's genomic data (from [PURPLE](https://github.com/hartwigmedical/hmftools/tree/master/purity-ploidy-estimator){target="_blank"}), and mutation status if available (from [PCGR](https://github.com/sigven/pcgr){target="_blank"}).

##### Percentiles

```{r cn_expr_data_table_losses_perc, comment = NA, message=FALSE, warning=FALSE, eval = runPurpleChunk}
##### Percentile-based expression summary table for genes within CN losses, with mutation status info when PCGR data is available
##### Keep only the CN column for genes with CN values at or below the "cn_bottom" threshold
cn_data <- ref_dataset.list[[dataset]][["expr_mut_cn_data"]]
cn_data <- cn_data[ cn_data$CN <= cn_bottom, ]
cn_data <- cn_data[, "CN", drop = FALSE]

##### Genes of interest found among the rownames of the lost-genes CN data
genes_losses <- as.character(cn_genes[ cn_genes %in% rownames(cn_data) ])

##### Number of lost genes to consider: none, all of them, or at most "params$top_genes"
if ( length(genes_losses) == 0 ) {
  genes_losses <- NULL
  genes_losses_no <- 0
} else if ( length(genes_losses) > params$top_genes ) {
  genes_losses_no <- params$top_genes
} else {
  genes_losses_no <- length(genes_losses)
}

##### Flag whether gained and lost genes together exceed the "params$top_genes" limit
limit_genes <- genes_gains_no + genes_losses_no > params$top_genes

##### Get expression data
data <- ref_dataset.list[[dataset]][["data_to_report"]]

if ( runPcgrChunk && runPurpleChunk ) {
  
  ##### Expression, CN and mutation data
  cn_expr_genes.expr.losses.perc <- exprTable(
    genes = genes_losses,
    data = data,
    cn_data = cn_data,
    cn_decrease = FALSE,
    targets = targets,
    sampleName = sample_name,
    ext_cancer = ext_cancer_group,
    int_cancer = int_cancer_group,
    comp_cancer = comp_cancer_group,
    add_cancer = add_cancer_group,
    genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
    mut_annot = ref_genes.list[["pcgr"]][, c("SYMBOL", "TIER", "CONSEQUENCE", "VARIANT_CLASS", "AF_TUMOR", "GENOMIC_CHANGE", "PROTEIN_CHANGE")],
    oncokb_annot = ref_genes.list[["genes_oncokb"]],
    cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline") ],
    ext_links = TRUE,
    type = "perc",
    scaling = scaling
  )
  
} else if ( runPurpleChunk ) {
  
  ##### Expression and CN data only
  cn_expr_genes.expr.losses.perc <- exprTable(
    genes = genes_losses,
    data = data,
    cn_data = cn_data,
    cn_decrease = FALSE,
    targets = targets,
    sampleName = sample_name,
    ext_cancer = ext_cancer_group,
    int_cancer = int_cancer_group,
    comp_cancer = comp_cancer_group,
    add_cancer = add_cancer_group,
    genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
    oncokb_annot = ref_genes.list[["genes_oncokb"]],
    cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline") ],
    ext_links = TRUE,
    type = "perc",
    scaling = scaling
  )
}

##### Show the table widget (first element of the "exprTable" output)
cn_expr_genes.expr.losses.perc[[1]]

##### Optionally write the table widget to a standalone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = cn_expr_genes.expr.losses.perc[[1]],
    file = paste(exprTableDir, "cn_expr_genes.expr.losses.perc.html", sep = "/"),
    selfcontained = TRUE
  )
}
```

<details>
<summary>Table legend</summary>
<font size="2">

The <span style="color:#ff0000">RED</span> colour range indicates relatively **high expression** (percentile) values and <span style="color:#0000ff">BLUE</span> colour range indicates relatively **low expression** (percentile) values in individual sample group. The <span style="color:#808080">BLANK</span> cells with missing values indicate genes with **no/low expression**. The **Diff** (**Patient vs `r comp_cancer_group`**) column illustrates the difference between percentiles in patient sample and reference cancer cohort for each gene. The **CN values** based on patient's genomic data are presented in *Patient (CN)* column with a horizontal blue bar indicating the CN value of each gene in the context of other genes. If mutation data is available, then the variants’ tier, consequence, class and tumour allele frequency (AF), as well as genomic and protein change are also provided on right-hand side based on information from [PCGR](https://github.com/sigven/pcgr){target="_blank"} report (similar to [Mutated genes] section). Genes are ordered by **Patient (CN)** and then by **decreasing** absolute values in the **Diff** (**Patient vs `r comp_cancer_group`**) columns. *CN* - copy-number

</font>
</details>

`r if ( runPurpleChunk && length(genes_losses) > 2000 ) { c(paste0("<span style=\"color:#ff0000\">NOTE</span>, the table was truncated to 2000 entries.")) } else { cat("") }`

***

##### Z-scores

```{r cn_expr_data_table_losses, comment = NA, message=FALSE, warning=FALSE, eval = runPurpleChunk}
##### Z-score version of the CN losses summary table. It re-uses "genes_losses", "data" and "cn_data" prepared in the percentiles chunk
##### With mutation data available: per-gene expression values, CN values and mutation status info
if ( runPcgrChunk && runPurpleChunk ) {
  cn_expr_genes.expr.losses.z <- exprTable(
    genes = genes_losses,
    data = data,
    cn_data = cn_data,
    cn_decrease = FALSE,
    targets = targets,
    sampleName = sample_name,
    ext_cancer = ext_cancer_group,
    int_cancer = int_cancer_group,
    comp_cancer = comp_cancer_group,
    add_cancer = add_cancer_group,
    genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
    mut_annot = ref_genes.list[["pcgr"]][, c("SYMBOL", "TIER", "CONSEQUENCE", "VARIANT_CLASS", "AF_TUMOR", "GENOMIC_CHANGE", "PROTEIN_CHANGE")],
    oncokb_annot = ref_genes.list[["genes_oncokb"]],
    cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline") ],
    ext_links = TRUE,
    type = "z",
    scaling = scaling
  )

##### Without mutation data: per-gene expression values and CN values only
} else if ( runPurpleChunk ) {
  cn_expr_genes.expr.losses.z <- exprTable(
    genes = genes_losses,
    data = data,
    cn_data = cn_data,
    cn_decrease = FALSE,
    targets = targets,
    sampleName = sample_name,
    ext_cancer = ext_cancer_group,
    int_cancer = int_cancer_group,
    comp_cancer = comp_cancer_group,
    add_cancer = add_cancer_group,
    genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
    oncokb_annot = ref_genes.list[["genes_oncokb"]],
    cancer_genes = ref_genes.list[["genes_cancer"]][, c("Oncogene", "TSG", "Fusion", "Germline") ],
    ext_links = TRUE,
    type = "z",
    scaling = scaling
  )
}

##### Show the summary table widget (first element of the "exprTable" output)
cn_expr_genes.expr.losses.z[[1]]

##### Optionally write the table to a standalone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = cn_expr_genes.expr.losses.z[[1]],
    file = paste0(exprTableDir, "/", "cn_expr_genes.expr.losses.z.html"),
    selfcontained = TRUE
  )
}

##### Remove objects that are no longer needed
rm(cn_data, cn_expr_genes.expr.losses.z)
```

<details>
<summary>Table legend</summary>
<font size="2">

The <span style="color:#ff0000">RED</span> colour range indicates relatively **high expression** (Z-score) values and <span style="color:#0000ff">BLUE</span> colour range indicates relatively **low expression** (Z-score) values in individual sample group. The <span style="color:#808080">BLANK</span> cells with missing values indicate genes with **no/low expression**. The **Diff** (**Patient vs `r comp_cancer_group`**) column illustrates the difference between Z-scores in patient sample and reference cancer cohort for each gene. The **CN values** based on patient's genomic data are presented in *Patient (CN)* column with a horizontal blue bar indicating the CN value of each gene in the context of other genes. If mutation data is available, then the variants’ tier, consequence, class and tumour allele frequency (AF), as well as genomic and protein change are also provided on right-hand side based on information from [PCGR](https://github.com/sigven/pcgr){target="_blank"} report (similar to [Mutated genes] section). Genes are ordered by **Patient (CN)** and then by **decreasing** absolute values in the **Diff** (**Patient vs `r comp_cancer_group`**) columns. *CN* - copy-number

</font>
</details>

`r if ( runPurpleChunk && length(genes_losses) > 2000 ) { c(paste0("<span style=\"color:#ff0000\">NOTE</span>, the table was truncated to 2000 entries.")) } else { cat("") }`

***

### - Expression profiles {.tabset}

`r if ( exists("limit_genes") && exists("genes_gains_no") ) { if ( limit_genes ) { c(paste0("Expression profiles for ", genes_gains_no, " genes with the highest (**gains**) and ", genes_losses_no, " genes with the lowest (**losses**) CN values and the greatest difference in mRNA expression (percentile) values between patient's sample and the average mRNA expression in samples from cancer patients.")) } else { cat(" ") }}`

#### Gains {.tabset}

```{r cdf_plot_cn_expr_gains, echo=FALSE, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, eval = runPurpleChunk, results="asis"}
suppressMessages(library(plotly))
##### Generate empirical cumulative distribution function (ECDF) plot illustrating mRNA expression level for the genes of interest in the context of the overall mRNA expression distribution
output_cdf <- list()
output_counts <- list()
output_density <- list()

##### Gene symbols are taken from the second element of the CN gains (percentiles) "exprTable" output
genes <- cn_expr_genes.expr.gains.perc[[2]]$SYMBOL

##### For each gene generate (1) CDF plot and add boxplot below to show the data variance for selected gene in individual groups, (2) bar-plot of read count data across all samples and (3) density plot to demonstrate expression distribution in investigated sample
##### seq_len() yields an empty sequence when no genes are to be reported, so the loop body is skipped entirely
for ( i in seq_len(genes_gains_no) ) {
  if ( genes[i] %in% rownames(data) ) {
    
    ##### CDF plot
    output_cdf[[i]] <- cdfPlot(gene = genes[i], data = data, targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group, addBoxPlot = TRUE, scaling = scaling, report_dir = results_dir)
    
    ##### Bar-plot of read counts
    ##### First map the gene symbol to Ensembl ID (used in the counts data)
    genes.ENSEMBL <- ref_dataset.list[[dataset]][["gene_annot_all"]]$ENSEMBL[ ref_dataset.list[[dataset]][["gene_annot_all"]]$SYMBOL ==  genes[i] ]
    
    output_counts[[i]] <- barPlot(gene = genes.ENSEMBL, data = ref_dataset.list[[dataset]][["combined_data"]], y_title = "Counts", targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group )
    
    ##### Density plot - expression distribution
    output_density[[i]] <- densityPlot(gene = genes[i], data = data, main_title= "", x_title = "Z-score", sampleName = sample_name, distributions = c("normal", "bimodal"), scaling = scaling) 
  }
}

##### Now once the plots are ready show them in separate tabs
##### The chunk output is emitted "asis", so each level-5 heading printed below becomes a tab within the "Gains" tabset
if ( genes_gains_no != 0 ) {
  for ( i in seq_len(genes_gains_no) ) {
    if ( genes[i] %in% rownames(data) ) {
      cat("\n##### ", genes[i], "\n")
      cat(renderTags(output_cdf[[i]])$html)
      cat("\n<details>\n")
      cat("\n<summary>Plot legend</summary>\n")
      cat("<font size=\"2\">\n")
      cat(paste0("**Top panel**: distribution of percentile values (*y-axis*) as a function of expression levels (Z-scores, *x-axis*) for *", genes[i], "* in patient's sample (*black dot*) and other reference cancer cohort(s) (median value(s)).\n\n"))
      cat(paste0("**Bottom panel**: box-plot presenting expression level (Z-score) of *", genes[i], "* in patient's sample (*black dot*) and its expression levels observed across samples from other reference cancer cohort(s).\n"))
      cat("\n</font>\n")
      cat("\n</details>\n")
      cat("\n<details>\n")
      cat("\n<summary>Read counts</summary>\n")
      cat(renderTags(output_counts[[i]])$html)
      cat("<font size=\"2\">\n")
      cat(paste0("Bar-plot illustrating read counts for *", genes[i], "* across all samples. The *", genes[i], "* read count in patient's sample is indicated by *black bar*.\n"))
      cat("\n</font>\n")
      cat("\n</details>\n")
      cat("\n<details>\n")
      cat("\n<summary>Expression distribution patterns</summary>\n")
      cat(renderTags(output_density[[i]])$html)
      cat("<font size=\"2\">\n")
      cat(paste0("Plot illustrating distribution of expression levels (Z-scores) of *", genes[i], "* *observed* across all samples along with simulated *normal* and *bimodal* distributions. The *", genes[i], "* expression level observed in patient's sample is indicated by *black dot* in each distribution.\n"))
      cat("\n</font>\n")
      cat("\n</details>\n")
      cat("\n***\n")
    } else {
      ##### Use the same heading level as for genes with expression data, so that the gene stays a tab within the "Gains" tabset
      cat("\n##### ", genes[i], "\n")
      cat("\n<span style=\"color:#ff0000\">NOTE</span>, expression data is not available for that gene.\n")
      cat("\n***\n")
    }
    #### Clear plots to free up some memory
    if(!is.null(dev.list())) invisible(dev.off())
  }
} else {
  cat("\nNo alterations were reported.\n")
   cat("\n***\n")
}

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload=FALSE)

##### Clean the space. The pattern is a regular expression, so it removes every object whose name starts with "output"
rm(list = ls(pattern='^output'))
```

***

#### Losses {.tabset}

```{r cdf_plot_cn_expr_losses, echo=FALSE, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, eval = runPurpleChunk, results="asis"}
suppressMessages(library(plotly))
##### Generate empirical cumulative distribution function (ECDF) plot illustrating mRNA expression level for the genes of interest in the context of the overall mRNA expression distribution
output_cdf <- list()
output_counts <- list()
output_density <- list()

##### Gene symbols are taken from the second element of the CN losses (percentiles) "exprTable" output
genes <- cn_expr_genes.expr.losses.perc[[2]]$SYMBOL

##### For each gene generate (1) CDF plot and add boxplot below to show the data variance for selected gene in individual groups, (2) bar-plot of read count data across all samples and (3) density plot to demonstrate expression distribution in investigated sample
##### seq_len() yields an empty sequence when no genes are to be reported, so the loop body is skipped entirely
for ( i in seq_len(genes_losses_no) ) {
  if ( genes[i] %in% rownames(data) ) {
    
    ##### CDF plot
    output_cdf[[i]] <- cdfPlot(gene = genes[i], data = data, targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group, addBoxPlot = TRUE, scaling = scaling, report_dir = results_dir)
    
    ##### Bar-plot of read counts
    ##### First map the gene symbol to Ensembl ID (used in the counts data)
    genes.ENSEMBL <- ref_dataset.list[[dataset]][["gene_annot_all"]]$ENSEMBL[ ref_dataset.list[[dataset]][["gene_annot_all"]]$SYMBOL ==  genes[i] ]
    
    output_counts[[i]] <- barPlot(gene = genes.ENSEMBL, data = ref_dataset.list[[dataset]][["combined_data"]], y_title = "Counts", targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group )
    
    ##### Density plot - expression distribution
    output_density[[i]] <- densityPlot(gene = genes[i], data = data, main_title= "", x_title = "Z-score", sampleName = sample_name, distributions = c("normal", "bimodal"), scaling = scaling)
  }
}

##### Now once the plots are ready show them in separate tabs
##### The chunk output is emitted "asis", so each level-5 heading printed below becomes a tab within the "Losses" tabset
if ( genes_losses_no != 0 ) {
  for ( i in seq_len(genes_losses_no) ) {
    if ( genes[i] %in% rownames(data) ) {
      cat("\n##### ", genes[i], "\n")
      cat(renderTags(output_cdf[[i]])$html)
      cat("\n<details>\n")
      cat("\n<summary>Plot legend</summary>\n")
      cat("<font size=\"2\">\n")
      cat(paste0("**Top panel**: distribution of percentile values (*y-axis*) as a function of expression levels (Z-scores, *x-axis*) for *", genes[i], "* in patient's sample (*black dot*) and other reference cancer cohort(s) (median value(s)).\n\n"))
      cat(paste0("**Bottom panel**: box-plot presenting expression level (Z-score) of *", genes[i], "* in patient's sample (*black dot*) and its expression levels observed across samples from other reference cancer cohort(s).\n"))
      cat("\n</font>\n")
      cat("\n</details>\n")
      cat("\n<details>\n")
      cat("\n<summary>Read counts</summary>\n")
      cat(renderTags(output_counts[[i]])$html)
      cat("<font size=\"2\">\n")
      cat(paste0("Bar-plot illustrating read counts for *", genes[i], "* across all samples. The *", genes[i], "* read count in patient's sample is indicated by *black bar*.\n"))
      cat("\n</font>\n")
      cat("\n</details>\n")
      cat("\n<details>\n")
      cat("\n<summary>Expression distribution patterns</summary>\n")
      cat(renderTags(output_density[[i]])$html)
      cat("<font size=\"2\">\n")
      cat(paste0("Plot illustrating distribution of expression levels (Z-scores) of *", genes[i], "* *observed* across all samples along with simulated *normal* and *bimodal* distributions. The *", genes[i], "* expression level observed in patient's sample is indicated by *black dot* in each distribution.\n"))
      cat("\n</font>\n")
      cat("\n</details>\n")
      cat("\n***\n")
    } else {
      ##### Use the same heading level as for genes with expression data, so that the gene stays a tab within the "Losses" tabset
      cat("\n##### ", genes[i], "\n")
      cat("\n<span style=\"color:#ff0000\">NOTE</span>, expression data is not available for that gene.\n")
      cat("\n***\n")
    }
  }
  #### Clear plots to free up some memory
  if(!is.null(dev.list())) invisible(dev.off())
  
} else {
  cat("\nNo alterations were reported.\n")
   cat("\n***\n")
}

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload=FALSE)

##### Clean the space. The pattern is a regular expression, so it removes every object whose name starts with "output"
rm(list = ls(pattern='^output'))
rm(limit_genes)
```

`r if ( !runPurpleChunk ) { c("***") } else { c(" ") }`

## Immune markers

This section presents expression levels of immune markers to assess pre-existing anti-cancer immunity and the likelihood of response to immunotherapy. Their mRNA expression levels in the patient's sample are presented alongside their average mRNA expression in samples from cancer cohorts.

Out of the `r length(unique(unlist(ref_genes.list[["genes_immune"]]$immune_markers$SYMBOL)))` immune markers the expression of **`r length(which(ref_genes.list[["genes_immune"]]$immune_markers$SYMBOL %in% rownames(ref_dataset.list[[dataset]][["data_to_report"]])))`** was reliably measured in patient's sample. The remaining `r length(which(ref_genes.list[["genes_immune"]]$immune_markers$SYMBOL %!in% rownames(ref_dataset.list[[dataset]][["data_to_report"]])))` genes are either not expressed or their expression level is too low to be detected (indicated in <span style="color:#808080">BLANK</span> cells with missing values).

### - Summary table {.tabset}

#### Percentiles

```{r immune_genes_table_perc, comment = NA, message=FALSE, warning=FALSE}
##### Append the "Immune markers" section to the MySQL commands used to populate the RNA-seq data portal
mysql_populate_update <- paste0(mysql_populate_update, ",Immune markers")
mysql_populate <- paste0(mysql_populate, ",Immune markers")

##### Collect the inputs for the immune markers expression summary table (percentiles)
data <- ref_dataset.list[[dataset]][["data_to_report"]]
targets <- ref_dataset.list[[dataset]][["sample_annot"]]
genes <- unique(unlist(ref_genes.list[["genes_immune"]]$immune_markers$SYMBOL))

##### Use NULL to flag that no immune markers are available
if ( length(genes) < 1 ) {
  genes <- NULL
}

##### Build the table and keep only the first element of the "exprTable" output
immune_genes.expr.perc <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL", "Immune_Cycle_Role")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  ext_links = TRUE,
  type = "perc",
  scaling = scaling
)[[1]]

##### Show the expression summary table
immune_genes.expr.perc

##### Optionally write the table to a standalone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = immune_genes.expr.perc,
    file = paste0(exprTableDir, "/", "immune_genes.expr.perc.html"),
    selfcontained = TRUE
  )
}

##### Remove the table object once it has been presented
rm(immune_genes.expr.perc)
```

<details>
<summary>Table legend</summary>
<font size="2">

The <span style="color:#ff0000">RED</span> colour range indicate relatively **high expression** (percentile) values and <span style="color:#0000ff">BLUE</span> colour range indicate relatively **low expression** (percentile) values in individual sample group. The <span style="color:#808080">BLANK</span> cells with missing values indicate genes with **no/low expression**. The **Diff** (**Patient vs `r comp_cancer_group`**) column illustrates the difference between percentiles in patient sample and reference cancer cohort for each immune marker. Genes are ordered by **decreasing** absolute values in the **Diff** (**Patient vs `r comp_cancer_group`**) column.

</font>
</details>

***

#### Z-scores

```{r immune_genes_table, comment = NA, message=FALSE, warning=FALSE}
##### Immune markers expression summary table (Z-scores). It re-uses "genes", "data" and "targets" set in the percentiles chunk
##### Only the first element of the "exprTable" output is kept
immune_genes.expr.z <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL", "Immune_Cycle_Role")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  ext_links = TRUE,
  type = "z",
  scaling = scaling
)[[1]]

##### Show the expression summary table
immune_genes.expr.z

##### Optionally write the table to a standalone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = immune_genes.expr.z,
    file = paste0(exprTableDir, "/", "immune_genes.expr.z.html"),
    selfcontained = TRUE
  )
}

##### Remove the table object once it has been presented
rm(immune_genes.expr.z)
```

<details>
<summary>Table legend</summary>
<font size="2">

The <span style="color:#ff0000">RED</span> colour range indicate relatively **high expression** (Z-score) values and <span style="color:#0000ff">BLUE</span> colour range indicate relatively **low expression** (Z-score) values in individual sample group. The <span style="color:#808080">BLANK</span> cells with missing values indicate genes with **no/low expression**. The **Diff** (**Patient vs `r comp_cancer_group`**) column illustrates the difference between Z-scores in patient sample and reference cancer cohort for each immune marker. Genes are ordered by **decreasing** absolute values in the **Diff** (**Patient vs `r comp_cancer_group`**) column.

</font>
</details>

***

### - Expression overview {.tabset}

Overview of immune markers expression profiles in patient's sample and in samples from cancer patients.

#### Percentiles

```{r glance_expr_plot_immune_genes_perc, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3}
suppressMessages(library(plotly))

##### Draw the overview box-plot of immune markers expression (percentiles), or print a note when no immune markers are available
if ( is.null(genes) ) {
  cat("\nNo expression data is available for immune markers!\n")
} else {
  glanceExprPlot(
    genes = genes,
    data = data,
    targets = targets,
    sampleName = sample_name,
    ext_cancer = ext_cancer_group,
    int_cancer = int_cancer_group,
    comp_cancer = comp_cancer_group,
    add_cancer = add_cancer_group,
    hexcode = "immune_genes",
    type = "perc",
    sort = "alphabetically",
    scaling = scaling,
    report_dir = results_dir
  )
}

##### plotly is detached again as it otherwise clashes with other graphics devices
detach("package:plotly", unload=FALSE)

##### Close the open graphics device, if any, to free up some memory
if ( !is.null(dev.list()) ) {
  invisible(dev.off())
}
```

<details>
<summary>Plot legend</summary>
<font size="2">

The individual box(es) represent the `r if ( int_cancer_group == comp_cancer_group ) { paste0(int_cancer_group, " and") } else { cat("") }` `r ext_cancer_group` `r if ( !is.null(add_cancer_group) ) { paste0("and ", add_cancer_group) } else { cat("") }` reference cancer cohort(s), and the **BLACK** dots indicate expression (percentile) values for each gene in the patient sample. Genes are ordered  **alphabetically**.

</font>
</details>

***

#### Z-scores

```{r glance_expr_plot_immune_genes, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3}
suppressMessages(library(plotly))

##### Draw the overview box-plot of immune markers expression (Z-scores), or print a note when no immune markers are available
if ( is.null(genes) ) {
  cat("\nNo expression data is available for immune markers!\n")
} else {
  glanceExprPlot(
    genes = genes,
    data = data,
    targets = targets,
    sampleName = sample_name,
    ext_cancer = ext_cancer_group,
    int_cancer = int_cancer_group,
    comp_cancer = comp_cancer_group,
    add_cancer = add_cancer_group,
    hexcode = "immune_genes",
    type = "z",
    sort = "alphabetically",
    scaling = scaling,
    report_dir = results_dir
  )
}

##### plotly is detached again as it otherwise clashes with other graphics devices
detach("package:plotly", unload=FALSE)

##### Close the open graphics device, if any, to free up some memory
if ( !is.null(dev.list()) ) {
  invisible(dev.off())
}
```

<details>
<summary>Plot legend</summary>
<font size="2">

The individual box(es) represent the `r if ( int_cancer_group == comp_cancer_group ) { paste0(int_cancer_group, " and") } else { cat("") }` `r ext_cancer_group` `r if ( !is.null(add_cancer_group) ) { paste0("and ", add_cancer_group) } else { cat("") }` reference cancer cohort(s), and the **BLACK** dots indicate expression (Z-score) values for each gene in the patient sample. Genes are ordered  **alphabetically**.

</font>
</details>

***

`r if ( params$immunogram ) { c("### - Immunogram {.tabset}") }`

`r if ( params$immunogram ) { c("Visualisation of general and local cancer immunity status using **cancer immunogram** for the [cancer-immunity cycle](https://www.sciencedirect.com/topics/medicine-and-dentistry/tumor-immunity){target=\"_blank\"} (CIC), a concept of integrated immune biomarkers scoring system proposed by [Blank et al.](https://science.sciencemag.org/content/352/6286/658.summary){target=\"_blank\"}") }`

`r if ( params$immunogram ) { c("#### Plot") }`

```{r immunogram_plot, comment = NA, message=FALSE, warning=FALSE, fig.width = 6, fig.height = 6, eval = params$immunogram}
##### Draw a spider-web (immunogram) plot presenting the patient's cancer immunity status. Background reading on immunograms:
# https://www.sciencedirect.com/science/article/pii/S1556086417300084
# https://www.sciencedirect.com/science/article/pii/S1556086417302125
# https://www.europeanurology.com/article/S0302-2838(18)30685-7/fulltext?rss=yes
##### NOTE: currently, the mean expression (Z-score) values of genes from each of the 7 CIC steps are presented rather than the normalized enrichment scores (NES) from GSEA analysis performed for each geneset (CIC step)

##### Present cancer immunity status for the patient using web-plot
##### The row to plot is selected with "ncol(data)" - presumably the patient sample is the last entry in the immunogram data frame (TODO confirm)
webplot(
  as.data.frame(ref_genes.list[["genes_immune"]]$immunogram.df),
  data.row = ncol(data),
  main = "",
  add = FALSE,
  col = "black"
)

##### Disabled: overlay of samples with specific immunogram patterns, e.g. T-cell–rich, T-cell–poor, T-cell–intermediate...
#webplot(as.data.frame(ref_genes.list[["genes_immune"]]$immunogram.df), data.row = 5, main = "", add = TRUE, col = "powderblue", lty = 5)
#webplot(as.data.frame(ref_genes.list[["genes_immune"]]$immunogram.df), data.row = 156, main = "", add = TRUE, col = "forestgreen", lty = 5)
#webplot(as.data.frame(ref_genes.list[["genes_immune"]]$immunogram.df), data.row = 194, main = "", add = TRUE, col = "red", lty = 5)
#legend("topright", legend=c("Patient", "T-cell–rich","T-cell–poor", "T-cell–intermediate"), fill=c("black", "powderblue", "forestgreen", "red"), bty="n", bg = "transparent", cex = 0.8)

#### Disabled: clear plots to free up some memory
#if(!is.null(dev.list())) invisible(dev.off())
```

`r if ( params$immunogram ) { c("***") }`

`r if ( params$immunogram ) { c("#### Table {.tabset}") }`

```{r immunogram_table_legend, echo=FALSE, comment = NA, message=FALSE, warning=FALSE, eval = params$immunogram, results="asis"}
##### Print the collapsible legend for the immunogram Z-score table as raw markdown/HTML (the chunk uses results="asis")
##### The comparison cohort name is passed to cat() as a value: knitr evaluates inline R expressions only in the document text, not inside strings emitted by a chunk, so an inline expression embedded in the string would be shown verbatim in the report
cat("\n<details>\n")
cat("\n<summary>Table legend</summary>\n")
cat("\n<font size=\"2\">\n")
cat(
  "\nThe <span style=\"color:#ff0000\">RED</span> colour range indicate relatively **high expression** (Z-score) values and <span style=\"color:#0000ff\">BLUE</span> colour range indicate relatively **low expression** (Z-score) values in individual sample group. The <span style=\"color:#808080\">BLANK</span> cells with missing values indicate genes with **no/low expression**. The **Diff** (**Patient vs ",
  comp_cancer_group,
  "**) column illustrates the difference between Z-scores in patient sample and reference cancer cohort for each immunogram gene. Genes are ordered by **decreasing** absolute values in the **Diff** (**Patient vs ",
  comp_cancer_group,
  "**) column.\n",
  sep = ""
)
cat("\n</font>\n")
cat("\n</details>\n")
cat("\n***\n")
```

`r if ( params$immunogram ) { c("##### Percentiles") }`

```{r immunogram_table_perc, comment = NA, message=FALSE, warning=FALSE, eval = params$immunogram}
##### Collect the expression data, the samples annotation and the immunogram gene symbols
data <- ref_dataset.list[[dataset]][["data_to_report"]]
targets <- ref_dataset.list[[dataset]][["sample_annot"]]
genes <- unique(unlist(ref_genes.list[["genes_immune"]]$immunogram$SYMBOL))

##### An empty gene list is passed to exprTable as NULL
if ( length(genes) < 1 ) {
  genes <- NULL
}

##### Generate expression summary table (percentiles) for the immunogram genes. Only the first element of the exprTable output is kept
immunogram.expr.perc <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL", "CIC")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  ext_links = TRUE,
  type = "perc",
  scaling = scaling
)[[1]]

##### Show the table in the report
immunogram.expr.perc

##### Optionally write the table to a standalone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = immunogram.expr.perc,
    file = paste0(exprTableDir, "/", "immunogram.expr.perc.html"),
    selfcontained = TRUE
  )
}

##### The table object is not needed any more
rm(immunogram.expr.perc)
```

```{r immunogram_table_legend_perc, echo=FALSE, comment = NA, message=FALSE, warning=FALSE, eval = params$immunogram, results="asis"}
##### Print the collapsible legend for the immunogram percentile table as raw markdown/HTML (the chunk uses results="asis")
##### The comparison cohort name is passed to cat() as a value: knitr evaluates inline R expressions only in the document text, not inside strings emitted by a chunk, so an inline expression embedded in the string would be shown verbatim in the report
cat("\n<details>\n")
cat("\n<summary>Table legend</summary>\n")
cat("\n<font size=\"2\">\n")
cat(
  "\nThe <span style=\"color:#ff0000\">RED</span> colour range indicate relatively **high expression** (percentile) values and <span style=\"color:#0000ff\">BLUE</span> colour range indicate relatively **low expression** (percentile) values in individual sample group. The <span style=\"color:#808080\">BLANK</span> cells with missing values indicate genes with **no/low expression**. The **Diff** (**Patient vs ",
  comp_cancer_group,
  "**) column illustrates the difference between percentiles in patient sample and reference cancer cohort for each immunogram gene. Genes are ordered by **decreasing** absolute values in the **Diff** (**Patient vs ",
  comp_cancer_group,
  "**) column.\n",
  sep = ""
)
cat("\n</font>\n")
cat("\n</details>\n")
cat("\n***\n")
```

`r if ( params$immunogram ) { c("##### Z-scores") }`

```{r immunogram_table, comment = NA, message=FALSE, warning=FALSE, eval = params$immunogram}
##### Generate expression summary table (Z-scores) for the immunogram genes. "genes", "data" and "targets" are the objects prepared in the "immunogram_table_perc" chunk. Only the first element of the exprTable output is kept
immunogram.expr.z <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL", "CIC")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  ext_links = TRUE,
  type = "z",
  scaling = scaling
)[[1]]

##### Show the table in the report
immunogram.expr.z

##### Optionally write the table to a standalone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = immunogram.expr.z,
    file = paste0(exprTableDir, "/", "immunogram.expr.z.html"),
    selfcontained = TRUE
  )
}
```

## HRD genes

This section presents expression levels of homologous recombination deficiency (HRD) genes to assess how many of these demonstrate low expression, which may indicate potential promoter methylation events. Their mRNA expression levels are presented for the patient's sample alongside their average mRNA expression in samples from cancer cohorts.

Out of the `r length(unique(unlist(ref_genes.list[["genes_hrd"]]$SYMBOL)))` HRD genes, the expression of **`r length(which(ref_genes.list[["summary"]]$HRD %in% rownames(ref_dataset.list[[dataset]][["data_to_report"]])))`** was reliably measured in patient's sample. The remaining `r length(which(ref_genes.list[["summary"]]$HRD %!in% rownames(ref_dataset.list[[dataset]][["data_to_report"]])))` genes are either not expressed or their expression level is too low to be detected (indicated in <span style="color:#808080">BLANK</span> cells with missing values).

### - Summary table {.tabset}

#### Percentiles

```{r hrd_genes_table_perc, comment = NA, message=FALSE, warning=FALSE}
##### Append the "HRD genes" section to the MySQL commands used to populate the RNA-seq data portal
mysql_populate_update <- paste0(mysql_populate_update, ",HRD genes")
mysql_populate <- paste0(mysql_populate, ",HRD genes")

##### Collect the expression data, the samples annotation and the HRD gene symbols (gene list provided by Richard)
data <- ref_dataset.list[[dataset]][["data_to_report"]]
targets <- ref_dataset.list[[dataset]][["sample_annot"]]
genes <- unique(unlist(ref_genes.list[["genes_hrd"]]$SYMBOL))

##### An empty gene list is passed to exprTable as NULL
if ( length(genes) < 1 ) {
  genes <- NULL
}

##### Generate expression summary table (percentiles) for the HRD genes. The full exprTable output is kept, since its second element is used later in the report
hrd_genes.expr.perc <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  ext_links = TRUE,
  type = "perc",
  scaling = scaling
)

##### Show the table in the report
hrd_genes.expr.perc[[1]]

##### Optionally write the table to a standalone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = hrd_genes.expr.perc[[1]],
    file = paste0(exprTableDir, "/", "hrd_genes.expr.perc.html"),
    selfcontained = TRUE
  )
}
```

<details>
<summary>Table legend</summary>
<font size="2">

The <span style="color:#ff0000">RED</span> colour range indicate relatively **high expression** (percentile) values and <span style="color:#0000ff">BLUE</span> colour range indicate relatively **low expression** (percentile) values in individual sample group. The <span style="color:#808080">BLANK</span> cells with missing values indicate genes with **no/low expression**. The **Diff** (**Patient vs `r comp_cancer_group`**) column illustrates the difference between percentiles in patient sample and reference cancer cohort for each HRD gene. Genes are ordered by **decreasing** absolute values in the **Diff** (**Patient vs `r comp_cancer_group`**) column. *TSG* - tumour suppressor gene

</font>
</details>

***

#### Z-scores

```{r hrd_genes_table, comment = NA, message=FALSE, warning=FALSE}
##### Generate expression summary table (Z-scores) for the HRD genes (gene list provided by Richard). "genes", "data" and "targets" are the objects prepared in the "hrd_genes_table_perc" chunk
hrd_genes.expr.z <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  ext_links = TRUE,
  type = "z",
  scaling = scaling
)

##### Show the table in the report
hrd_genes.expr.z[[1]]

##### Optionally write the table to a standalone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = hrd_genes.expr.z[[1]],
    file = paste0(exprTableDir, "/", "hrd_genes.expr.z.html"),
    selfcontained = TRUE
  )
}

##### The table object is not needed any more
rm(hrd_genes.expr.z)
```

<details>
<summary>Table legend</summary>
<font size="2">

The <span style="color:#ff0000">RED</span> colour range indicate relatively **high expression** (Z-score) values and <span style="color:#0000ff">BLUE</span> colour range indicate relatively **low expression** (Z-score) values in individual sample group. The <span style="color:#808080">BLANK</span> cells with missing values indicate genes with **no/low expression**. The **Diff** (**Patient vs `r comp_cancer_group`**) column illustrates the difference between Z-scores in patient sample and reference cancer cohort for each HRD gene. Genes are ordered by **decreasing** absolute values in the **Diff** (**Patient vs `r comp_cancer_group`**) column.

</font>
</details>

***

### - Expression overview {.tabset}

Overview of HRD genes expression profiles in patient's sample and in samples from cancer patients.

#### Percentiles

```{r glance_expr_plot_hrd_genes_perc, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3}
##### plotly is attached for this chunk only (see the detach call below)
suppressMessages(library(plotly))

##### Overview boxplot (percentiles) for the HRD genes, or a message when no genes are available
if ( is.null(genes) ) {
  cat("\nNo expression data is available for HRD genes!\n")
} else {
  glanceExprPlot(
    genes = genes,
    data = data,
    targets = targets,
    sampleName = sample_name,
    ext_cancer = ext_cancer_group,
    int_cancer = int_cancer_group,
    comp_cancer = comp_cancer_group,
    add_cancer = add_cancer_group,
    hexcode = "hrd_genes",
    type = "perc",
    sort = "alphabetically",
    scaling = scaling,
    report_dir = results_dir
  )
}

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload = FALSE)

##### Close the current graphics device, if any is open, to free up some memory
if ( length(dev.list()) > 0 ) {
  invisible(dev.off())
}
```

<details>
<summary>Plot legend</summary>
<font size="2">

The individual box(es) represent the `r if ( int_cancer_group == comp_cancer_group ) { paste0(int_cancer_group, " and") } else { cat("") }` `r ext_cancer_group` `r if ( !is.null(add_cancer_group) ) { paste0("and ", add_cancer_group) } else { cat("") }` reference cancer cohort(s), and the **BLACK** dots indicate expression (percentile) values for each gene in the patient sample. Genes are ordered  **alphabetically**.

</font>
</details>

***

#### Z-scores

```{r glance_expr_plot_hrd_genes, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3}
##### plotly is attached for this chunk only (see the detach call below)
suppressMessages(library(plotly))

##### Overview boxplot (Z-scores) for the HRD genes, or a message when no genes are available
if ( is.null(genes) ) {
  cat("\nNo expression data is available for HRD genes!\n")
} else {
  glanceExprPlot(
    genes = genes,
    data = data,
    targets = targets,
    sampleName = sample_name,
    ext_cancer = ext_cancer_group,
    int_cancer = int_cancer_group,
    comp_cancer = comp_cancer_group,
    add_cancer = add_cancer_group,
    hexcode = "hrd_genes",
    type = "z",
    sort = "alphabetically",
    scaling = scaling,
    report_dir = results_dir
  )
}

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload = FALSE)

##### Close the current graphics device, if any is open, to free up some memory
if ( length(dev.list()) > 0 ) {
  invisible(dev.off())
}
```

<details>
<summary>Plot legend</summary>
<font size="2">

The individual box(es) represent the `r if ( int_cancer_group == comp_cancer_group ) { paste0(int_cancer_group, " and") } else { cat("") }` `r ext_cancer_group` `r if ( !is.null(add_cancer_group) ) { paste0("and ", add_cancer_group) } else { cat("") }` reference cancer cohort(s), and the **BLACK** dots indicate expression (Z-score) values for each gene in the patient sample. Genes are ordered  **alphabetically**.

</font>
</details>

***

## Cancer genes

mRNA expression levels of cancer genes in patient's sample and their average mRNA expression in samples from cancer cohorts. These include genes reported in the following gene panels/resources *[UMCCR cancer genes](https://github.com/vladsaveliev/NGS_Utils/blob/master/ngs_utils/reference_data/key_genes/umccr_cancer_genes.2019-03-20.tsv){target="_blank"}*, *[OncoKB](http://oncokb.org/#/cancerGenes){target="_blank"}*, *[MSK-IMPACT](https://www.mskcc.org/msk-impact){target="_blank"}*, *[MSK-HEME](http://www.islh.org/Presentation_Upload/presentation_uploads/12_52_0900-Zehir.pdf){target="_blank"}*, *[Foundation One](https://www.foundationmedicine.com/genomic-testing/foundation-one-cdx){target="_blank"}*, *[Foundation One Heme](https://www.foundationmedicine.com/genomic-testing/foundation-one-heme){target="_blank"}*, *[Vogelstein](http://science.sciencemag.org/content/339/6127/1546.full){target="_blank"}* and *[Sanger Cancer Gene Census](https://www.sanger.ac.uk/science/data/cancer-gene-census){target="_blank"}* (CGC).

### - Summary table {.tabset}

Out of the `r nrow(ref_genes.list[["genes_cancer"]])` cancer genes the expression of **`r length(which(rownames(ref_genes.list[["genes_cancer"]]) %in% rownames(ref_dataset.list[[dataset]][["data_to_report"]])))`** was reliably measured in patient's sample. The remaining `r length(which(rownames(ref_genes.list[["genes_cancer"]]) %!in% rownames(ref_dataset.list[[dataset]][["data_to_report"]])))` genes are either not expressed or their expression level is too low to be detected (indicated in <span style="color:#808080">BLANK</span> cells with missing values).

#### Percentiles

```{r cancer_genes_table_perc, comment = NA, message=FALSE, warning=FALSE}
##### Append the "Cancer genes" and "All genes" sections to the MySQL commands used to populate the RNA-seq data portal
mysql_populate_update <- paste0(mysql_populate_update, ",Cancer genes,All genes")
mysql_populate <- paste0(mysql_populate, ",Cancer genes,All genes")

##### Collect the expression data, the samples annotation and the cancer genes from OncoKB and UMCCR (https://github.com/vladsaveliev/NGS_Utils/blob/master/ngs_utils/reference_data/key_genes/umccr_cancer_genes.2019-03-20.tsv)
data <- ref_dataset.list[[dataset]][["data_to_report"]]
targets <- ref_dataset.list[[dataset]][["sample_annot"]]
genes <- rownames(ref_genes.list[["genes_cancer"]])

##### An empty gene list is passed to exprTable as NULL
if ( length(genes) < 1 ) {
  genes <- NULL
}

##### Generate expression summary table (percentiles) for the cancer genes. The full exprTable output is kept, since its second element is used by the expression overview plots
cancer_genes.expr.perc <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  cancer_genes = ref_genes.list[["genes_cancer"]],
  ext_links = TRUE,
  type = "perc",
  scaling = scaling
)

##### Show the table in the report
cancer_genes.expr.perc[[1]]

##### Optionally write the table to a standalone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = cancer_genes.expr.perc[[1]],
    file = paste0(exprTableDir, "/", "cancer_genes.expr.perc.html"),
    selfcontained = TRUE
  )
}
```

<details>
<summary>Table legend</summary>
<font size="2">

The <span style="color:#ff0000">RED</span> colour range indicate relatively **high expression** (percentile) values and <span style="color:#0000ff">BLUE</span> colour range indicate relatively **low expression** (percentile) values in individual sample group. The <span style="color:#808080">BLANK</span> cells with missing values indicate genes with **no/low expression**. The **Diff** (**Patient vs `r comp_cancer_group`**) column illustrates the difference between percentiles in patient sample and reference cancer cohort for each cancer gene. Genes considered to be oncogenes or tumour suppressor genes, according to [OncoKB](http://oncokb.org/#/cancerGenes){target="_blank"} database, and inclusion in various sequencing panels are also indicated. Genes are ordered by **decreasing** absolute values in the **Diff** (**Patient vs `r comp_cancer_group`**) column. *TSG* - tumour suppressor gene

</font>
</details>

***

#### Z-scores

```{r cancer_genes_table, comment = NA, message=FALSE, warning=FALSE}
##### Generate expression summary table (Z-scores) for cancer genes from OncoKB and UMCCR (https://github.com/vladsaveliev/NGS_Utils/blob/master/ngs_utils/reference_data/key_genes/umccr_cancer_genes.2019-03-20.tsv). "genes", "data" and "targets" are the objects prepared in the "cancer_genes_table_perc" chunk. Only the first element of the exprTable output is kept
cancer_genes.expr.z <- exprTable(
  genes = genes,
  data = data,
  targets = targets,
  sampleName = sample_name,
  ext_cancer = ext_cancer_group,
  int_cancer = int_cancer_group,
  comp_cancer = comp_cancer_group,
  add_cancer = add_cancer_group,
  genes_annot = ref_dataset.list[[dataset]][["gene_annot_all"]][, c("SYMBOL", "ENSEMBL")],
  oncokb_annot = ref_genes.list[["genes_oncokb"]],
  cancer_genes = ref_genes.list[["genes_cancer"]],
  ext_links = TRUE,
  type = "z",
  scaling = scaling
)[[1]]

##### Show the table in the report
cancer_genes.expr.z

##### Optionally write the table to a standalone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = cancer_genes.expr.z,
    file = paste0(exprTableDir, "/", "cancer_genes.expr.z.html"),
    selfcontained = TRUE
  )
}

##### The table object is not needed any more
rm(cancer_genes.expr.z)
```

<details>
<summary>Table legend</summary>
<font size="2">

The <span style="color:#ff0000">RED</span> colour range indicate relatively **high expression** (Z-score) values and <span style="color:#0000ff">BLUE</span> colour range indicate relatively **low expression** (Z-score) values in individual sample group. The <span style="color:#808080">BLANK</span> cells with missing values indicate genes with **no/low expression**. The **Diff** (**Patient vs `r comp_cancer_group`**) column illustrates the difference between Z-scores in patient sample and reference cancer cohort for each cancer gene. Genes considered to be oncogenes or tumour suppressor genes, according to [OncoKB](http://oncokb.org/#/cancerGenes){target="_blank"} database, and inclusion in various sequencing panels are also indicated. Genes are ordered by **decreasing** absolute values in the **Diff** (**Patient vs `r comp_cancer_group`**) column. *TSG* - tumour suppressor gene

</font>
</details>

***

### - Expression overview {.tabset}

Overview of expression profiles of 50 altered cancer genes with the greatest difference in mRNA expression (percentile) values between patient's sample and the average mRNA expression in samples from cancer patients.

#### Percentiles

```{r glance_expr_plot_cancer_genes_perc, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3}
##### plotly is attached for this chunk only (see the detach call below)
suppressMessages(library(plotly))

##### Generate overview boxplot
##### Take up to 50 genes from the top of the second element of the percentile exprTable output. head() is used instead of "[1:50]" because the latter pads the vector with NAs when fewer than 50 genes are available
genes <- head(cancer_genes.expr.perc[[2]]$SYMBOL, 50)

##### An empty gene list is turned into NULL so that the "no data" message below is shown (in this chunk and in the Z-scores chunk that re-uses "genes")
if ( length(genes) == 0 ) {
  genes <- NULL
}

if ( !is.null(genes) ) {
  glanceExprPlot(genes = genes, data = data, targets = targets, sampleName = sample_name, ext_cancer = ext_cancer_group, int_cancer = int_cancer_group, comp_cancer = comp_cancer_group, add_cancer = add_cancer_group, hexcode = "cancer_genes", type = "perc", sort = "none", scaling = scaling, report_dir = results_dir)
} else {
  cat("\nNo expression data is available for cancer genes!\n")
}

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload=FALSE)

##### Close the current graphics device, if any is open, to free up some memory
if(!is.null(dev.list())) invisible(dev.off())
```

<details>
<summary>Plot legend</summary>
<font size="2">

The individual box(es) represent the `r if ( int_cancer_group == comp_cancer_group ) { paste0(int_cancer_group, " and") } else { cat("") }` `r ext_cancer_group` `r if ( !is.null(add_cancer_group) ) { paste0("and ", add_cancer_group) } else { cat("") }` reference cancer cohort(s), and the **BLACK** dots indicate expression (percentile) values for each gene in the patient sample. Genes are ordered by **decreasing** absolute values in the **Patient vs `r comp_cancer_group`** comparison.

</font>
</details>

***

#### Z-scores

```{r glance_expr_plot_cancer_genes, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3}
##### plotly is attached for this chunk only (see the detach call below)
suppressMessages(library(plotly))

##### Overview boxplot (Z-scores) for the genes selected in the "glance_expr_plot_cancer_genes_perc" chunk, or a message when no genes are available
if ( is.null(genes) ) {
  cat("\nNo expression data is available for cancer genes!\n")
} else {
  glanceExprPlot(
    genes = genes,
    data = data,
    targets = targets,
    sampleName = sample_name,
    ext_cancer = ext_cancer_group,
    int_cancer = int_cancer_group,
    comp_cancer = comp_cancer_group,
    add_cancer = add_cancer_group,
    hexcode = "cancer_genes",
    type = "z",
    sort = "none",
    scaling = scaling,
    report_dir = results_dir
  )
}

##### Detach plotly package. Otherwise it clashes with other graphics devices
detach("package:plotly", unload = FALSE)

##### Close the current graphics device, if any is open, to free up some memory
if ( length(dev.list()) > 0 ) {
  invisible(dev.off())
}
```

<details>
<summary>Plot legend</summary>
<font size="2">

The individual box(es) represent the `r if ( int_cancer_group == comp_cancer_group ) { paste0(int_cancer_group, " and") } else { cat("") }` `r ext_cancer_group` `r if ( !is.null(add_cancer_group) ) { paste0("and ", add_cancer_group) } else { cat("") }` reference cancer cohort(s), and the **BLACK** dots indicate expression (Z-score) values for each gene in the patient sample. Genes are ordered by **decreasing** absolute values in the **Patient vs `r comp_cancer_group`** comparison.

</font>
</details>

***

`r if ( params$drugs ) { c("## Drug matching {.tabset}") }`

`r if ( params$drugs ) { c("List of drugs targeting variants in detected [Mutated genes], [Fusion genes], [Structural variants]-affected genes, [CN altered genes], [HRD genes] and dysregulated [Cancer genes], which can be considered in the treatment decision making process. The clinically actionable aberrations are matched based on information provided by *[clinical interpretations of variants in Cancer](https://civicdb.org/home){target=\"_blank\"}* (CIViC) ([Griffith et al. (2017)](https://www.ncbi.nlm.nih.gov/pubmed/28138153){target=\"_blank\"}). The evidence pertaining to variants effect on therapeutic response is also provided.") }`

```{r drugs_table_dir, comment = NA, message=FALSE, warning=FALSE}
##### Output directory for the drug tables; it is created (including missing parent directories) unless the path already exists
drugsTableDir <- paste0(results_dir, "/", "drugsTables")

if ( !file.exists(drugsTableDir) ) {
  dir.create(path = drugsTableDir, recursive = TRUE)
}
```

`r if ( params$drugs ) { c("### - Mutated genes -") }`

`r if ( params$drugs && !runPcgrChunk ) { c("Mutation data for this sample is **NOT AVAILABLE**.") }`

`r if ( params$drugs && runPcgrChunk ) { paste0("**", length(mut_genes.expr.perc[[2]]$SYMBOL), "** genes with [PCGR](https://github.com/sigven/pcgr){target=\"_blank\"} [tier](https://pcgr.readthedocs.io/en/latest/tier_systems.html#tier-model-2-pcgr-acmg){target=\"_blank\"} 1-", params$pcgr_tier, " variants were screened for suitable drugs (see [Mutated genes] section).") } else if ( params$drugs && !runPcgrChunk ) { paste0("**0** genes with [PCGR](https://github.com/sigven/pcgr){target=\"_blank\"} [tier](https://pcgr.readthedocs.io/en/latest/tier_systems.html#tier-model-2-pcgr-acmg){target=\"_blank\"} 1-", params$pcgr_tier, " variants were screened for suitable drugs (see [Mutated genes] section).") }`

```{r drugs_predictive_mut_genes, comment = NA, message=FALSE, warning=FALSE, eval = runPcgrChunk}
##### Genes taken from the second element of the mutated genes percentile table
genes <- mut_genes.expr.perc[[2]]$SYMBOL

##### Match the mutated genes against CIViC predictive evidence for variants of type "mutation"
drugsTable.mut_genes <- civicDrugTable(
  genes,
  civic_var_summaries = caner_genes_annot.list[["civic_var_summaries"]],
  civic_clin_evid = caner_genes_annot.list[["civic_clin_evid"]],
  evid_type = "Predictive",
  var_type = "mutation"
)

##### The table is shown only when the drug matching section is requested
if ( params$drugs ) {
  drugsTable.mut_genes[[1]]
}

##### Optionally write the drug table to a standalone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = drugsTable.mut_genes[[1]],
    file = paste0(drugsTableDir, "/", "drugsTable.mut_genes.html"),
    selfcontained = TRUE
  )
}
```

`r if ( params$drugs ) { c("<details>\n<summary>Table legend</summary>\n<font size=\"2\">") }`

```{r drugsTable_legend_mut_genes, echo=FALSE, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, results="asis", eval=params$drugs}
##### Legend explaining the CIViC evidence levels, trust ratings and actionability score. It is assembled line by line (joined with newlines), printed here and re-used by the legend chunks of the other drug tables
drugsTable_legend <- paste(
  c(
    "**[Evidence Level](https://civicdb.org/help/evidence/evidence-levels){target=\"_blank\"}**",
    "",
    "* **A - Validated association**: Proven/consensus association in human medicine",
    "* **B - Clinical evidence**: Clinical trial or other primary patient data supports association",
    "* **C - Case study**: Individual case reports from clinical journals",
    "* **D - Preclinical evidence**: *In vivo* or *in vitro* models support association",
    "* **E - Inferential association**: Indirect evidence",
    "",
    "**[Trust Rating](https://civicdb.org/help/evidence/trust-ratings){target=\"_blank\"}**",
    "",
    "* **5**: Strong, well supported evidence from a lab or journal with respected academic standing. Experiments are well controlled, and results are clean and reproducible across multiple replicates. Evidence confirmed using independent methods. The study is statistically well powered",
    "* **4**: Strong, well supported evidence. Experiments are well controlled, and results are convincing. Any discrepancies from expected results are well-explained and not concerning",
    "* **3**: Evidence is convincing, but not supported by a breadth of experiments. May be smaller scale projects, or novel results without many follow-up experiments. Discrepancies from expected results are explained and not concerning",
    "* **2**: Evidence is not well supported by experimental data, and little follow-up data is available. Publication is from a journal with low academic impact. Experiments may lack proper controls, have small sample size, or are not statistically convincing",
    "* **1**: Claim is not supported well by experimental evidence. Results are not reproducible, or have very small sample size. No follow-up is done to validate novel claims",
    "",
    "**[Actionability Score ](https://civicdb.org/help/variants/actionability-score){target=\"_blank\"}**",
    "",
    "* [CIViC Actionability Score](https://civicdb.org/help/variants/actionability-score){target=\"_blank\"} allows to assess the accumulation of evidence for each variant. It is calculated by adding all Evidence Item Scores for each variant. The Evidence Item Score is calculated by multiplying the evidence level (A=10 points, B=5 points, C=3 points, D=1 point, E=0.25 points) by the trust rating (each Star = 1 point)."
  ),
  collapse = "\n"
)

cat(drugsTable_legend)
```

`r if ( params$drugs ) { c("</font>\n</details>") }`

`r if ( params$drugs ) { c("***") }`

`r if ( params$drugs ) { c("### - Fusion genes -") }`

`r if ( params$drugs && !runFusionChunk ) { c("Fusion genes information for this sample is **NOT AVAILABLE**.") }`

`r if ( params$drugs && runFusionChunk ) { paste0("<span style=\"color:#ff0000\">**", nrow(fusions[ fusions$geneA_dna_support == "Yes" | fusions$geneB_dna_support == "Yes" , ]), "**</span> **DNA-supported** fusion genes (see [Structural variants] section) and <span style=\"color:#02d653\">**", nrow(fusions[ fusions$reported_fusion == "Yes" , ]), "**</span> gene fusions **reported in [FusionGDB](https://ccsm.uth.edu/FusionGDB){target=\"_blank\"}** were screened for suitable drugs.") } else if ( params$drugs && !runFusionChunk ) { paste0("<span style=\"color:#ff0000\">**0**</span> involving **DNA-supported** fusion genes (see [Structural variants] section) and <span style=\"color:#02d653\">**0**</span> gene fusions **reported in [FusionGDB](https://ccsm.uth.edu/FusionGDB){target=\"_blank\"}** were screened for suitable drugs.") }`

```{r drugs_predictive_fusion_genes, comment = NA, message=FALSE, warning=FALSE, eval = runFusionChunk}
##### Generate table with drugs targeting fusion genes
##### Select fusions that are reported in FusionGDB or DNA-supported for either partner gene. The row filter is computed once, from the columns of "fusions" itself (the same table and columns used for the counts in the text above this chunk), instead of from the separate "fusion_annot" object, so that the filter always lines up with the rows being selected. which() drops rows where the flags are missing
fusions_to_screen <- which(fusions$reported_fusion == "Yes" | fusions$geneA_dna_support == "Yes" | fusions$geneB_dna_support == "Yes")

genesA <- as.vector(fusions$geneA[fusions_to_screen])
genesB <- as.vector(fusions$geneB[fusions_to_screen])

##### Match both fusion partners against CIViC predictive evidence for variants of type "fusion"
drugsTable.fusion_genes <- civicDrugTable(genes = unique(c(genesA, genesB)), civic_var_summaries  = caner_genes_annot.list[["civic_var_summaries"]], civic_clin_evid  = caner_genes_annot.list[["civic_clin_evid"]],  evid_type = "Predictive", var_type = "fusion")

##### The table is shown only when the drug matching section is requested
if ( params$drugs ) {
  drugsTable.fusion_genes[[1]]
}

##### Optionally write the drug table to a standalone html file
if ( params$save_tables ) {
  saveWidgetFix(widget=drugsTable.fusion_genes[[1]], file=paste(drugsTableDir, "drugsTable.fusion_genes.html", sep = "/"), selfcontained=TRUE)
}
```

`r if ( params$drugs ) { c("<details>\n<summary>Table legend</summary>\n<font size=\"2\">") }`

```{r drugsTable_legend_fusion_genes, echo=FALSE, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, results="asis", eval=params$drugs}
##### Re-print the CIViC legend text stored in "drugsTable_legend"
cat(drugsTable_legend, sep = " ")
```

`r if ( params$drugs ) { c("</font>\n</details>") }`

`r if ( params$drugs ) { c("***") }`

`r if ( params$drugs ) { c("### - Structural variants -") }`

`r if ( params$drugs && !runSVsChunk ) { c("SVs information for this sample is **NOT AVAILABLE**.") }`

`r if ( params$drugs && runSVsChunk ) { paste0("**", length(unique(manta_sv$Gene)), "** genes affected by structural variants (SVs) were screened for suitable drugs (see [Structural variants] section).") } else if ( params$drugs && !runSVsChunk ) { paste0("**0** genes affected by structural variants (SVs) were screened for suitable drugs (see [Structural variants] section).") }`

```{r drugs_predictive_sv_genes, comment = NA, message=FALSE, warning=FALSE, eval = runSVsChunk}
##### Unique genes listed in the "Gene" column of the structural variants table
genes <- unique(manta_sv$Gene)

##### Match the SV-affected genes against CIViC predictive evidence; no variant type is specified (var_type = NULL)
drugsTable.sv_genes <- civicDrugTable(
  genes,
  civic_var_summaries = caner_genes_annot.list[["civic_var_summaries"]],
  civic_clin_evid = caner_genes_annot.list[["civic_clin_evid"]],
  evid_type = "Predictive",
  var_type = NULL
)

##### The table is shown only when the drug matching section is requested
if ( params$drugs ) {
  drugsTable.sv_genes[[1]]
}

##### Optionally write the drug table to a standalone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = drugsTable.sv_genes[[1]],
    file = paste0(drugsTableDir, "/", "drugsTable.sv_genes.html"),
    selfcontained = TRUE
  )
}
```

`r if ( params$drugs ) { c("<details>\n<summary>Table legend</summary>\n<font size=\"2\">") }`

```{r drugsTable_legend_sv_genes, echo=FALSE, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, results="asis", eval=params$drugs}
##### Re-print the CIViC legend text stored in "drugsTable_legend"
cat(drugsTable_legend, sep = " ")
```

`r if ( params$drugs ) { c("</font>\n</details>") }`

`r if ( params$drugs ) { c("***") }`

`r if ( params$drugs ) { c("### - CN altered genes - {.tabset}") }`

`r if ( params$drugs && !runPurpleChunk ) { c("CN information for this sample is **NOT AVAILABLE**.") }`

`r if ( params$drugs && runPurpleChunk ) { paste0("**", length(cn_genes), "** genes with CN values >= ", cn_top, "  (**gains**) or =< ", cn_bottom, " (**losses**) were screened for suitable drugs (see [CN altered genes] section).") } else if ( params$drugs && !runPurpleChunk ) { paste0("**0** genes were affected by CN changes (see [CN altered genes] section).") }`

`r if ( params$drugs ) { c("#### Gains") }`

```{r drugs_predictive_cn_altered_genes_gains, comment = NA, message=FALSE, warning=FALSE, eval = runPurpleChunk}
##### Genes taken from the second element of the CN gains percentile table
genes <- cn_expr_genes.expr.gains.perc[[2]]$SYMBOL

##### Match the genes with CN gains against CIViC predictive evidence for variants of type "copy_gain"
drugsTable.CN_altered_genes_gains <- civicDrugTable(
  genes,
  civic_var_summaries = caner_genes_annot.list[["civic_var_summaries"]],
  civic_clin_evid = caner_genes_annot.list[["civic_clin_evid"]],
  evid_type = "Predictive",
  var_type = "copy_gain"
)

##### The table is shown only when the drug matching section is requested
if ( params$drugs ) {
  drugsTable.CN_altered_genes_gains[[1]]
}

##### Optionally write the drug table to a standalone html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = drugsTable.CN_altered_genes_gains[[1]],
    file = paste0(drugsTableDir, "/", "drugsTable.CN_altered_genes_gains.html"),
    selfcontained = TRUE
  )
}
```

`r if ( params$drugs ) { c("<details>\n<summary>Table legend</summary>\n<font size=\"2\">") }`

```{r drugsTable_legend_cn_gains_genes, echo=FALSE, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, results="asis", eval=params$drugs}
##### Emit the drugs table legend text; the chunk uses results="asis", so the text is written into the report as-is
cat(drugsTable_legend)
```

`r if ( params$drugs ) { c("</font>\n</details>") }`

`r if ( params$drugs ) { c("***") }`

`r if ( params$drugs ) { c("#### Losses") }`

```{r drugs_predictive_cn_altered_genes_losses, comment = NA, message=FALSE, warning=FALSE, eval = runPurpleChunk}
##### Build the CIViC drug table for genes affected by copy-number losses
genes <- cn_expr_genes.expr.losses.perc[[2]]$SYMBOL

drugsTable.CN_altered_genes_losses <- civicDrugTable(
  genes,
  civic_var_summaries = caner_genes_annot.list[["civic_var_summaries"]],
  civic_clin_evid = caner_genes_annot.list[["civic_clin_evid"]],
  evid_type = "Predictive",
  var_type = "copy_loss"
)

##### Show the table (first element of the result) only when drug reporting is switched on
if ( params$drugs ) {
  drugsTable.CN_altered_genes_losses[[1]]
}

##### Optionally write the drugs table to a self-contained html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = drugsTable.CN_altered_genes_losses[[1]],
    file = paste0(drugsTableDir, "/", "drugsTable.CN_altered_genes_losses.html"),
    selfcontained = TRUE
  )
}
```

`r if ( params$drugs ) { c("<details>\n<summary>Table legend</summary>\n<font size=\"2\">") }`

```{r drugsTable_legend_cn_losses_genes, echo=FALSE, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, results="asis", eval=params$drugs}
##### Emit the drugs table legend text; the chunk uses results="asis", so the text is written into the report as-is
cat(drugsTable_legend)
```

`r if ( params$drugs ) { c("</font>\n</details>") }`

`r if ( params$drugs ) { c("***") }`

`r if ( params$drugs ) { c("### - HRD genes -") }`

`r if ( params$drugs ) { paste0("**", length(which(hrd_genes.expr.perc[[2]]$SYMBOL %in% rownames(ref_dataset.list[[dataset]][["data_to_report"]]))), "** reliably measured [HRD genes] were screened for suitable drugs (see [HRD genes] section).") }`

```{r drugs_predictive_hrd_genes, comment = NA, message=FALSE, warning=FALSE}
##### Build the CIViC drug table for HRD genes that are present in the reported expression data
hrd_symbols <- hrd_genes.expr.perc[[2]]$SYMBOL
genes <- hrd_symbols[ hrd_symbols %in% rownames(ref_dataset.list[[dataset]][["data_to_report"]]) ]

drugsTable.hrd_genes <- civicDrugTable(
  genes,
  civic_var_summaries = caner_genes_annot.list[["civic_var_summaries"]],
  civic_clin_evid = caner_genes_annot.list[["civic_clin_evid"]],
  evid_type = "Predictive",
  var_type = "mutation"
)

##### Show the table (first element of the result) only when drug reporting is switched on
if ( params$drugs ) {
  drugsTable.hrd_genes[[1]]
}

##### Optionally write the drugs table to a self-contained html file
if ( params$save_tables ) {
  saveWidgetFix(
    widget = drugsTable.hrd_genes[[1]],
    file = paste0(drugsTableDir, "/", "drugsTable.hrd_genes.html"),
    selfcontained = TRUE
  )
}
```

`r if ( params$drugs ) { c("<details>\n<summary>Table legend</summary>\n<font size=\"2\">") }`

```{r drugsTable_legend_hrd_genes, echo=FALSE, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, results="asis", eval=params$drugs}
##### Emit the drugs table legend text; the chunk uses results="asis", so the text is written into the report as-is
cat(drugsTable_legend)
```

`r if ( params$drugs ) { c("</font>\n</details>") }`

`r if ( params$drugs ) { c("***") }`

`r if ( params$drugs ) { c("### - Cancer genes -") }`

`r if ( params$drugs ) { paste0("**50** cancer genes with the greatest difference in standardised (Z-score) mRNA expression values between patient's sample and the average mRNA expression in samples from cancer patients were screened for suitable drugs (see [Cancer genes] section).") }`

```{r drugs_predictive_cancer_genes, comment = NA, message=FALSE, warning=FALSE}
##### Update MySQL command to populate RNA-seq data portal
##### NOTE(review): the "Drug matching" entry is appended unconditionally within this chunk, i.e. without checking params$drugs - confirm that is intended
mysql_populate <- paste0(mysql_populate, ",Drug matching")
mysql_populate_update <- paste0(mysql_populate_update, ",Drug matching")

##### Generate table with drugs targeting dysregulated cancer genes
##### Take at most the first 50 gene symbols. head() is used instead of [1:50] because the latter pads the vector with NA when fewer than 50 genes are available
genes <- head(cancer_genes.expr.perc[[2]]$SYMBOL, 50)

drugsTable.cancer_genes <- civicDrugTable(
  genes,
  civic_var_summaries = caner_genes_annot.list[["civic_var_summaries"]],
  civic_clin_evid = caner_genes_annot.list[["civic_clin_evid"]],
  evid_type = "Predictive",
  var_type = "expression"
)

##### Show the table (first element of the result) only when drug reporting is switched on
if ( params$drugs ) {
  drugsTable.cancer_genes[[1]]
}

##### Optionally write the drugs table to a self-contained html file
if ( params$save_tables ) {
  saveWidgetFix(widget=drugsTable.cancer_genes[[1]], file=paste(drugsTableDir, "drugsTable.cancer_genes.html", sep = "/"), selfcontained=TRUE)
}
```

`r if ( params$drugs ) { c("<details>\n<summary>Table legend</summary>\n<font size=\"2\">") }`

```{r drugsTable_legend_cancer_genes, echo=FALSE, comment = NA, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 3, results="asis", eval=params$drugs}
##### Emit the drugs table legend text; the chunk uses results="asis", so the text is written into the report as-is
cat(drugsTable_legend)
```

`r if ( params$drugs ) { c("</font>\n</details>") }`

`r if ( params$drugs ) { c("***") }`

```{r mysql_populate_finalise, message=FALSE, warning=FALSE}
##### Finalise the MySQL command used to populate the RNA-seq data portal and write it into a file
##### Add input data info
mysql_populate <- paste0(mysql_populate, ",Input data")
mysql_populate_update <- paste0(mysql_populate_update, ",Input data")

##### Add clinical data if available
if (runClinicalChunk ) {
  mysql_populate <- paste0(mysql_populate, ",Clinical information")
  mysql_populate_update <- paste0(mysql_populate_update, ",Clinical information")
}

##### Evaluate the date once so that the summary text and the date field always agree (repeated Sys.Date() calls could differ if the report is rendered across midnight)
report_date <- Sys.Date()
report_summary <- paste0("Transcriptome summary for sample ", sample_name, " generated on ", report_date)

##### Close the list of report sections and append the summary and date values
##### NOTE(review): sample_name is pasted into the SQL text without escaping - a double quote in the sample name would break the statement
mysql_populate <- paste0(mysql_populate, "\", \"", report_summary, "\", \"", report_date, "\" )")
mysql_populate_update <- paste0(mysql_populate_update, "\", Summary=\"", report_summary, "\", Date=\"", report_date, "\";")

##### Combine both parts, add the statements that re-number the ID column of the RNAseq_reports table, and write the result to <results_dir>/<sample_name>.RNAseq_report.sql
mysql_populate <- paste0(mysql_populate, "\n  ", mysql_populate_update, "\nSET @ID := 0;\nUPDATE RNAseq_reports SET ID = ( SELECT @ID := @ID + 1 );")
writeLines(mysql_populate, con = paste0(results_dir, "/", sample_name, ".RNAseq_report.sql"))
```

## Addendum

<details>
<summary>Parameters</summary>
<font size="2">

```{r params_info, comment = NA}
##### Print the name and value(s) of every report parameter
##### seq_along() is used instead of 1:length() so that nothing is printed when "params" is empty (1:0 would iterate over c(1, 0))
for ( i in seq_along(params) ) {
  ##### Multi-valued parameters are flattened and shown as a comma-separated string
  cat(paste0("Parameter: ", names(params)[i], "\nValue: ", paste(unlist(params[i]), collapse = ","), "\n\n"))
}
```

</font>
</details>

<details>
<summary>Reporter details</summary>
<font size="2">

```{r reporter_details, comment = NA}
##### Report the user, node name and operating system taken from a single Sys.info() call
reporter_sys_info <- Sys.info()
cat(
  paste0(
    "The report was generated by \"", reporter_sys_info[ "user"],
    "\" using \"", reporter_sys_info[ "nodename"],
    "\" node and \"", reporter_sys_info[ "sysname"],
    "\" operating system."
  )
)
```

</font>
</details>

<details>
<summary>Session information</summary>
<font size="2">

```{r session_info, comment = NA}
##### Print the R session information via devtools
devtools::session_info()
```

</font>
</details>
